Sfoglia il codice sorgente

Fix digitValue() and reading of numerical escaped characters in strings. Indexing a string always returns an unsigned char value. Add PEG VM and corresponding primitive __match__().

master
Ian Piumarta 11 mesi fa
parent
commit
1ea2557f7e
1 ha cambiato i file con 496 aggiunte e 38 eliminazioni
  1. +496
    -38
      minproto.leg

+ 496
- 38
minproto.leg Vedi File

@ -1,9 +1,9 @@
# minproto.leg -- minimal prototype langauge for semantic experiments
#
# last edited: 2024-06-13 14:47:05 by piumarta on zora
# last edited: 2024-06-20 02:30:46 by piumarta on m1mbp
%{
;
;
//#define YY_DEBUG 1
#ifndef GC
@ -46,6 +46,10 @@
# define PROFILE 0
#endif
#ifndef PEGVM // include parsing expression grammar VM
# define PEGVM 1
#endif
#include <math.h>
#include <stdint.h>
#include <stdio.h>
@ -139,7 +143,7 @@ doTypes(makeProto);
doProperties(declareProp);
#undef declareProp
#define doSymbols(_) _(t) _(name) _(expr) _(function) _(arguments) _(object) _(index) _(key) _(value) _(self) _(method) _(parameters) _(body) _(lambda) _(environment) _(operation) _(full) _(condition) _(consequent) _(alternate) _(expression) _(identifier) _(initialise) _(update) _(first) _(last) _(fixed) _(keyvals) _(__namespaces__) _(O) _(d) _(p) _(v) _(statement) _(handler) _(kind) _(message) _(operand1) _(operand2) _(profile) _(parent) _(count) _(stamp) _(time) _(start) _(stop)
#define doSymbols(_) _(t) _(name) _(expr) _(function) _(arguments) _(object) _(index) _(key) _(value) _(self) _(method) _(parameters) _(body) _(lambda) _(environment) _(operation) _(full) _(condition) _(consequent) _(alternate) _(expression) _(identifier) _(initialise) _(update) _(first) _(last) _(fixed) _(keyvals) _(__namespaces__) _(O) _(d) _(p) _(v) _(statement) _(handler) _(kind) _(message) _(operand1) _(operand2) _(profile) _(parent) _(count) _(stamp) _(time) _(start) _(stop) _($$) _(yytext) _(yyleng)
#define declareSym(NAME) oop sym_##NAME = 0;
doSymbols(declareSym);
@ -168,11 +172,11 @@ int nnlrs = 0;
int maxnlrs = 0;
oop valnlr = 0;
#define nlrPush() ({ \
if (++nnlrs >= maxnlrs) nlrs = realloc(nlrs, sizeof(struct NLR) * ++maxnlrs); \
nlrs[nnlrs - 1].ntrace = _get(trace, Object,isize); \
setjmp(nlrs[nnlrs - 1].env); \
})
#define nlrPush() ({ \
if (++nnlrs >= maxnlrs) nlrs = realloc(nlrs, sizeof(struct NLR) * ++maxnlrs); \
nlrs[nnlrs - 1].ntrace = _get(trace, Object,isize); \
setjmp(nlrs[nnlrs - 1].env); \
})
#define nlrReturn(VAL, TYPE) { \
valnlr = VAL; \
@ -366,26 +370,6 @@ oop newString(char *value)
#define isString(obj) is(String, obj)
int digitValue(int digit, int base)
{
if ('a' <= digit && digit <= 'z') digit -= 'a' - 10;
else if ('A' <= digit && digit <= 'Z') digit -= 'A' - 10;
else if (digit < '0' || digit > '9') return -1;
return (digit < base) ? digit : -1;
}
int readCharValue(char **stringp, int base, int limit)
{
char *string = *stringp;
int value = 0, d = 0;
while (limit-- && *string && (d = digitValue(*string, base)) >= 0) {
++string;
value = value * base + d;
}
*stringp = string;
return value;
}
int String_length(oop str) { return get(str, String,length); }
oop String_reset (oop str) { set(str, String,length, 0); return str; }
@ -470,6 +454,27 @@ oop String_repeat(oop s, int n)
return result;
}
int digitValue(int digit, int base)
{
if ('0' <= digit && digit <= '9') digit -= '0';
else if ('a' <= digit && digit <= 'z') digit -= 'a' - 10;
else if ('A' <= digit && digit <= 'Z') digit -= 'A' - 10;
else return -1;
return (digit < base) ? digit : -1;
}
int readCharValue(char **stringp, int base, int limit)
{
char *string = *stringp;
int value = 0, d = 0;
while (limit-- && *string && (d = digitValue(*string, base)) >= 0) {
++string;
value = value * base + d;
}
*stringp = string;
return value;
}
oop newStringUnescaped(char *string)
{
oop buf = newStringLen(0, 0);
@ -489,9 +494,9 @@ oop newStringUnescaped(char *string)
case 't' : c = '\t'; break;
case 'v' : c = '\v'; break;
case 'X' :
case 'x' : c = readCharValue(&string, 16, -1); break;
case '0' : c = readCharValue(&string, 8, 3); break;
default : warning("illegal character escape sequence: \\%c", c); break;
case 'x' : c = readCharValue(&string, 16, -1); break;
case '0'...'7': --string; c = readCharValue(&string, 8, 3); break;
default : warning("illegal character escape sequence: \\%c", c); break;
}
}
String_append(buf, c);
@ -525,7 +530,6 @@ oop String_escaped(oop obj)
String_format(buf, "\\%c", c);
}
}
return buf;
}
@ -1436,7 +1440,7 @@ oop storeOn(oop buf, oop obj, int indent)
char *str = _get(obj, String,value);
int len = _get(obj, String,length);
for (int i = 0; i < len; ++i) {
int c = str[i];
int c = (unsigned char)str[i];
switch (c) {
case '\a': String_appendAll(buf, "\\a"); break;
case '\b': String_appendAll(buf, "\\b"); break;
@ -1448,7 +1452,7 @@ oop storeOn(oop buf, oop obj, int indent)
case '"': String_appendAll(buf, "\\\""); break;
case '\\': String_appendAll(buf, "\\\\"); break;
default:
if (c < ' ' || c > '~') String_format(buf, "\\%04o", c);
if (c < ' ' || c > '~') String_format(buf, "\\%03o", c);
else String_append(buf, c);
break;
}
@ -1997,13 +2001,13 @@ oop GetArray_eval(oop exp, oop env)
if (isInteger(ind)) {
int index = _integerValue(ind);
switch (getType(obj)) {
case String: return newInteger(*String_aref(obj, index));
case Symbol: return newInteger(*Symbol_aref(obj, index));
case String: return newInteger(*(unsigned char *)String_aref(obj, index));
case Symbol: return newInteger(*(unsigned char *)Symbol_aref(obj, index));
case Object: return *Object_aref(obj, index);
default: typeError("[]", "non-indexable object", obj);
}
}
if (!is(Object, obj)) typeError("[]", "non-indexable object", obj);
if (!is(Object, obj)) typeError("[]", "non-associative object", obj);
return Object_getLocal(obj, ind);
}
@ -2039,7 +2043,7 @@ oop SetArray_eval(oop exp, oop env)
}
return val;
}
if (!is(Object, obj)) typeError("[]=", "non-indexable object", obj);
if (!is(Object, obj)) typeError("[]=", "non-associative object", obj);
return Object_put(obj, ind, val);
}
@ -4686,6 +4690,459 @@ oop prim_externalCall(oop func, oop self, oop args, oop env)
return 0;
}
#if PEGVM
oop applyThunkIn(oop func, oop env)
{
int functype = getType(func);
if (Primitive == functype) {
# if PROFILE
oop profile = nil;
if (opt_p) {
profile = _get(func, Primitive,profile);
if (nil == profile) profile = _set(func, Primitive,profile, profileInit(func));
profileTick(profile);
}
# endif
oop result = _get(func, Primitive,function)(func, nil, nil, env);
# if PROFILE
if (opt_p) profileTock(profile);
# endif
return result;
}
#if PRIMCLOSURE
if (Closure != functype) valueError("()", "cannot apply", func);
oop lambda = _get(func, Closure,function);
oop body = _get(lambda, Lambda,body);
#else
if (Object != functype || pClosure != _getDelegate(func))
valueError(nil == self ? "()" : ".()", "cannot apply", func);
oop lambda = Object_get(func, sym_function);
oop parameters = Object_get(lambda, sym_parameters);
# if PROFILE
oop profile = nil;
if (opt_p) {
profile = Object_getLocal(lambda, sym_profile);
if (nil == profile) profile = Object_put(lambda, sym_profile, profileInit(lambda));
profileTick(profile);
}
# endif
#endif
oop *exprs = get(body, Object,indexed);
int size = _get(body, Object,isize);
oop result = nil;
oop args = new(pObject);
// inherit from closure's captured environment
_setDelegate(args, env);
# if NONLOCAL
switch (nlrPush()) {
case NLR_CONTINUE: syntaxError("continue outside loop");
case NLR_BREAK: syntaxError("break outside loop");
case NLR_RETURN: return nlrPop();
case NLR_RAISE: nlrReturn(nlrPop(), NLR_RAISE);
}
# endif
for (int i = 0; i < size; ++i)
result = eval(exprs[i], args);
# if NONLOCAL
nlrPop();
# endif
# if PROFILE
if (opt_p) profileTock(profile);
# endif
return result;
}
typedef unsigned char byte;
typedef enum op_t {
PUSH, DROP, POP, DOT, CLASS, STRING, RULE, CALL,
SUCCEED, FAIL, ACTION, BEGIN, END, UNEND, SET,
} op_t;
char *op_n[] = {
"PUSH", "DROP", "POP", "DOT", "CLASS", "STRING", "RULE", "CALL",
"SUCCEED", "FAIL", "ACTION", "BEGIN", "END", "UNEND", "SET",
};
typedef struct vmInsn vmInsn;
struct vmInsn {
union {
char *str;
oop obj;
vmInsn *code;
} arg;
unsigned short op, arglen, ok, ko;
};
typedef struct vmState
{
oop result;
oop variables;
} vmState;
#define VM_STATE_INITIALISER { nil, nil }
void vmEnter(vmState *state, oop obj, char *yytext, int yyleng)
{
state->variables = new(state->variables);
Object_put(state->variables, sym_$$, state->result);
}
void vmSet(vmState *state, oop obj, char *yytext, int yyleng)
{
Object_put(state->variables, obj, state->result);
}
void vmAction(vmState *state, oop obj, char *yytext, int yyleng)
{
oop text = yyleng ? newStringLen(yytext, yyleng) : nil;
Object_put(state->variables, sym_yytext, text);
Object_put(state->variables, sym_yyleng, newInteger(yyleng));
applyThunkIn(obj, state->variables);
}
void vmLeave(vmState *state, oop obj, char *yytext, int yyleng)
{
state->result = Object_getLocal(state->variables, sym_$$);
state->variables = _getDelegate(state->variables);
}
void vmDisassemble(vmInsn *code, int pc)
{
vmInsn *i = &code[pc];
printf("%p ", code);
switch (i->op) {
case CLASS:
case STRING:
printf("%03d %-7s \"%s\" %2d %2d %2d\n",
pc, op_n[i->op], i->arg.str, i->arglen, i->ok, i->ko);
break;
case CALL:
printf("%03d %-7s %p %2d %2d %2d\n",
pc, op_n[i->op], i->arg.code, i->arglen, i->ok, i->ko);
break;
default:
printf("%03d %-7s %s %2d %2d %2d\n",
pc, op_n[i->op], codeString(i->arg.obj, 0), i->arglen, i->ok, i->ko);
break;
}
}
oop vmCache = 0;
void vmCachePut(oop grammar, oop symbol, vmInsn *code)
{
oop *ref = Object_refLocal(vmCache, grammar);
if (UNDEFINED == ref) {
Object_put(vmCache, grammar, new(pObject));
ref = _refvar(vmCache, grammar);
}
oop line = *ref;
ref = Object_refLocal(line, symbol);
if (UNDEFINED != ref) fatal("vm cache collision");
Object_put(line, symbol, mkptr(code));
}
vmInsn *vmCompile(oop grammar, oop symbol)
{
oop program = Object_get(grammar, symbol);
if (!is(Object, program)) valueError("__match__", "program is not an object", program);
oop *prog = _get(program, Object,indexed);
int plen = _get(program, Object,isize);
if (plen % 5 != 0) valueError("__match__", "program length is not a multiple of five", program);
vmInsn *code = calloc(plen / 5, sizeof(*code));
vmCachePut(grammar, symbol, code);
oop env = nil;
int ppc = 0;
int cpc = 0;
while (ppc < plen) {
int op = integerValue(prog[ppc++], "__match__");
oop arg = prog[ppc++];
int arglen = integerValue(prog[ppc++], "__match__");
int ok = integerValue(prog[ppc++], "__match__");
int ko = integerValue(prog[ppc++], "__match__");
code[cpc] = (vmInsn){ .arg.obj = arg, .op = op, .arglen = arglen, .ok = ok, .ko = ko };
switch (op) {
case CLASS:
case STRING: {
code[cpc].arg.str = stringValue(code[cpc].arg.obj, "__match__");
break;
}
case RULE: {
if (!is(Symbol, code[cpc].arg.obj)) valueError("__match__", "RULE argument must be a symbol", program);
break;
}
case CALL: {
valueError("__match__", "program contains CALL opcode", program);
break;
}
case ACTION: {
if (!isClosure(code[cpc].arg.obj)) valueError("__match__", "ACTION argument is not a function", program);
break;
}
case SET: {
if (!is(Symbol, code[cpc].arg.obj)) valueError("__match__", "SET argument is not a symbol", program);
if (nil == env) env = new(pObject);
Object_put(env, code[cpc].arg.obj, nil);
break;
}
default: {
break;
}
}
++cpc;
}
#if 0
if (!cpc || code[0].op != ENTER) valueError("__match__", "program does not begin with ENTER", program);
code[0].arg.obj = env; // clone this to make a list of local variables for the rule
#endif
if (opt_d) {
printf("---- BEGIN "); println(symbol, 0);
for (int pc = 0; pc < cpc; ++pc) vmDisassemble(code, pc);
printf("---- END "); println(symbol, 0);
}
return code;
}
vmInsn *vmCacheGet(oop grammar, oop symbol)
{
oop *ref = Object_refLocal(vmCache, grammar);
if (UNDEFINED != ref) {
oop line = *ref;
ref = Object_refLocal(line, symbol);
if (UNDEFINED != ref)
return (vmInsn *)(intptr_t)_integerValue(*ref);
}
return vmCompile(grammar, symbol);
}
int vmRun(oop grammar, oop symbol, char *text, int start, int length)
{
vmCache = new(pObject);
vmInsn *code = vmCacheGet(grammar, symbol);
struct Action {
void (*function)(vmState *state, oop object, char *yytext, int yyleng);
oop object;
int textbeg, textlen;
} *actions = 0;
struct Context { // for back-tracking
int position;
int nactions;
} *cstack, context;
int csp = 0, ncstack = 32;
cstack = malloc(sizeof(*cstack) * ncstack);
context.position = start;
context.nactions = 0;
vmState state = VM_STATE_INITIALISER;
# define saveAction(ACT, OBJ, BEG, LEN) { \
actions = realloc(actions, sizeof(*actions) * (context.nactions + 1)); \
actions[context.nactions++] = (struct Action){ ACT, OBJ, BEG, LEN }; \
}
struct Frame {
oop symbol;
vmInsn *code;
int pc;
int nactions;
} *rstack, frame;
int rsp = 0, nrstack = 32;
rstack = malloc(sizeof(*rstack) * nrstack);
frame.symbol = symbol;
frame.code = vmCacheGet(grammar, symbol);
frame.pc = 0;
frame.nactions = context.nactions;
int textbeg = 0, textend = 0;
int result = 0;
#define push(C, X) { \
if (C##sp >= n##C##stack) C##stack = realloc(C##stack, sizeof(*C##stack) * (n##C##stack *= 2)); \
C##stack[C##sp++] = (X); \
}
#define drop(C) { \
assert(C##sp); \
--C##sp; \
}
#define pop(C) ({ \
assert(C##sp); \
C##stack[--C##sp]; \
})
for (;;) {
if (opt_d) vmDisassemble(frame.code, frame.pc);
vmInsn *i = frame.code + frame.pc++;
switch (i->op) {
case PUSH: push(c, context); frame.pc = i->ok; continue;
case DROP: drop(c); frame.pc = i->ok; continue;
case POP: context = pop(c); frame.pc = i->ok; continue;
case DOT: {
if (context.position < length) {
context.position++;
frame.pc = i->ok;
continue;
}
frame.pc = i->ko;
continue;
}
case CLASS: {
if (context.position < length) {
byte c = text[context.position];
if ((((byte *)i->arg.str)[c/8] >> (c % 8)) & 1) {
context.position++;
frame.pc = i->ok;
continue;
}
}
frame.pc = i->ko;
continue;
}
case STRING: {
if (context.position + i->arglen <= length) {
if (0 == memcmp(text + context.position, i->arg.str, i->arglen)) {
context.position += i->arglen;
frame.pc = i->ok;
continue;
}
}
frame.pc = i->ko;
continue;
}
case RULE: {
// frame.pc--; // save pc of call insn
// push(r, frame);
// frame.code = vmCacheGet(grammar, i->arg.obj);
// frame.pc = 0;
// if (((Node *)(i->arg))->Symbol.nvars) {
// saveAction(enter, i->arg, 0, 0);
// frame.nactions = context.nactions;
// }
// continue;
//i->op = CALL;
//i->arglen = ((Node *)i->arg)->Symbol.nvars;
//i->arg = ((Node *)i->arg)->Symbol.code; assert(i->arg);
i->op = CALL;
i->arg.code = vmCacheGet(grammar, i->arg.obj);
goto doCall; // (just in case they are not consecutive ;-))
}
case CALL: { doCall:
frame.pc--; // save pc of call insn
push(r, frame);
frame.code = i->arg.code;
frame.pc = 0;
saveAction(vmEnter, nil, 0, 0);
frame.nactions = context.nactions;
continue;
}
case SUCCEED: {
if (frame.nactions == context.nactions) // no actions were added
context.nactions--; // remove the enter action
else
saveAction(vmLeave, nil, 0, 0);
if (rsp) {
frame = pop(r);
i = frame.code + frame.pc;
frame.pc = i->ok;
continue;
}
result = context.position - start;
break;
}
case FAIL: {
if (rsp) {
frame = pop(r);
i = frame.code + frame.pc;
frame.pc = i->ko;
continue;
}
result = -1;
break;
}
case ACTION: {
// printf("--> ACTION %d [%d %d]\n", context.nactions, textbeg, textend);
saveAction(vmAction, i->arg.obj, textbeg, textend - textbeg);
frame.pc = i->ok;
continue;
}
case BEGIN: {
textbeg = textend = context.position;
frame.pc = i->ok;
continue;
}
case END: {
textend = context.position;
frame.pc = i->ok;
continue;
}
case UNEND: {
textbeg = textend = 0;
frame.pc = i->ok;
continue;
}
case SET: {
saveAction(vmSet, i->arg.obj, 0, 0);
frame.pc = i->ok;
continue;
}
default: {
fatal("this cannot happen");
break;
}
}
break;
}
saveAction(vmLeave, nil, 0, 0);
#undef pop
#undef drop
#undef push
free(cstack);
free(rstack);
for (int i = 0; i < context.nactions; ++i) {
char *yytext = text + actions[i].textbeg;
int yyleng = actions[i].textlen;
// printf("==> ACTION %d [%d %d]\n", i, actions[i].textbeg, actions[i].textbeg + actions[i].textlen);
actions[i].function(&state, actions[i].object, yytext, yyleng);
}
vmCache = nil;
return result;
}
oop prim_match(oop func, oop self, oop args, oop env)
{
oop grammar = getArg (args, 0, "__match__");
oop symbol = getArgType(args, 1, Symbol, "__match__");
oop string = getArgType(args, 2, String, "__match__");
int start = _integerValue(getArgType(args, 3, Integer, "__match__"));
char *text = get(string, String,value);
int length = get(string, String,length);
oop program = Object_get(grammar, symbol);
int result = vmRun(grammar, symbol, text, start, length);
return newInteger(result);
}
#endif // PEGVM
oop replFile(FILE *in)
{
int oldline = lineno;
@ -4855,6 +5312,7 @@ int main(int argc, char **argv)
prim(error , prim_error);
prim(defined , prim_defined);
prim(__extern__ , prim_extern);
prim(__match__ , prim_match);
# undef prim

Caricamento…
Annulla
Salva