commit 2ce78bedd128b7d3d0d0c5ff3ec7f4c7c252d40b
parent ddb02625fc684a1434cc0369fcd57440e24d251e
Author: erai <erai@omiltem.net>
Date: Thu, 12 Sep 2024 23:46:33 -0400
peg parse peg.peg into postfix notation
Diffstat:
M | bufio.c | | | 38 | +++++++++++++++++++++++++++++++++++++- |
M | cc3.peg | | | 9 | ++++----- |
M | peg.c | | | 485 | +++++++++++++++++++++++++++++++++++++++++++++++++++++-------------------------- |
M | peg.peg | | | 28 | ++++++++++++++-------------- |
4 files changed, 379 insertions(+), 181 deletions(-)
diff --git a/bufio.c b/bufio.c
@@ -158,5 +158,41 @@ fseek(f: *file, off: int) {
f.r = 0;
f.w = 0;
f.eof = 0;
- lseek(f.fd, off, 0);
+ if lseek(f.fd, off, 0) != off {
+ die("invalid seek");
+ }
+}
+
+freadall(f: *file, size: *int): *byte {
+ var i: int;
+ var cap: int;
+ var ret: *byte;
+ var tmp: *byte;
+ var ch: int;
+
+ i = 0;
+ cap = 0;
+ loop {
+ ch = fgetc(f);
+ if ch == -1 {
+ *size = i;
+ return ret;
+ }
+
+ if i == cap {
+ if cap == 0 {
+ cap = 4096;
+ ret = alloc(f.a, cap);
+ } else {
+ cap = cap * 2;
+ tmp = alloc(f.a, cap);
+ memcpy(tmp, ret, i);
+ free(f.a, ret);
+ ret = tmp;
+ }
+ }
+
+ ret[i] = ch:byte;
+ i = i + 1;
+ }
}
diff --git a/cc3.peg b/cc3.peg
@@ -35,11 +35,11 @@ if_stmt <- if expr '{' sp stmt* '}' sp
(else if expr '{' sp stmt* '}' sp)*
(else '{' sp stmt '}' sp)?
-loop_stmt <- 'loop' sp '{' sp stmt* '}' sp
+loop_stmt <- loop '{' sp stmt* '}' sp
-break_stmt <- 'break' sp ';' sp
+break_stmt <- break ';' sp
-continue_stmt <- 'continue' sp ';' sp
+continue_stmt <- continue sp ';' sp
return_stmt <- return expr? sp ';' sp
@@ -116,5 +116,4 @@ ident <- !reserved [a-zA-Z_][a-zA-Z0-9_]* sp
tc <- ![a-zA-Z0-9_] sp
-sp <- ( [ \r\n\t]
- / '//' (![\r\n] .)* )*
+sp <- ( [ \r\n\t] / '//' (![\r\n] .)* )*
diff --git a/peg.c b/peg.c
@@ -1,10 +1,29 @@
-struct compiler {
+struct peg_frame {
+ pos: int;
+ depth: int;
+ op: int;
+}
+
+struct peg_op {
+ tag: int;
+ nargs: int;
+ start: int;
+ end: int;
+}
+
+struct peg {
a: *alloc;
f: *file;
+ src: *byte;
+ size: int;
pos: int;
- stack: *int;
+ stack: *peg_frame;
sp: int;
limit: int;
+ depth: int;
+ op: int;
+ out: *peg_op;
+ cap: int;
}
enum {
@@ -12,42 +31,47 @@ enum {
OK = 1,
}
-choice(c: *compiler) {
+choice(c: *peg) {
if c.sp == c.limit {
- die("backtrack overflow");
+ die("choice overflow");
}
- c.stack[c.sp] = c.pos;
+ c.stack[c.sp].pos = c.pos;
+ c.stack[c.sp].depth = c.depth;
+ c.stack[c.sp].op = c.op;
c.sp = c.sp + 1;
}
-commit(c: *compiler) {
+commit(c: *peg) {
if c.sp == 0 {
- die("backtrack underflow");
+ die("commit underflow");
}
c.sp = c.sp - 1;
}
-fail(c: *compiler) {
+fail(c: *peg) {
if c.sp == 0 {
- die("backtrack underflow");
+ die("fail underflow");
}
c.sp = c.sp - 1;
- c.pos = c.stack[c.sp];
- fseek(c.f, c.pos);
+ c.pos = c.stack[c.sp].pos;
+ c.depth = c.stack[c.sp].depth;
+ c.op = c.stack[c.sp].op;
}
-get(c: *compiler): int {
+get(c: *peg): int {
var ch: int;
- ch = fgetc(c.f);
- if ch != -1 {
- c.pos = c.pos + 1;
+ if c.pos == c.size {
+ return -1;
}
+ ch = c.src[c.pos]:int;
+ c.pos = c.pos + 1;
+
return ch;
}
-literal(c: *compiler, s: *byte): int {
+literal(c: *peg, s: *byte): int {
var i: int;
var ch: int;
@@ -69,7 +93,82 @@ literal(c: *compiler, s: *byte): int {
return OK;
}
-charclass(c: *compiler, s: *byte): int {
+enter(c: *peg) {
+ choice(c);
+}
+
+leave(c: *peg, tag: int) {
+ var nargs: int;
+ var start: int;
+ var end: int;
+ var tmp: *byte;
+
+ commit(c);
+
+ nargs = c.depth - c.stack[c.sp].depth;
+ start = c.stack[c.sp].pos;
+ end = c.pos;
+
+ if c.op == c.cap {
+ if c.cap == 0 {
+ c.cap = 1024;
+ c.out = alloc(c.a, c.cap * sizeof(c.out[0])):*peg_op;
+ } else {
+ c.cap = c.cap * 2;
+ tmp = alloc(c.a, c.cap * sizeof(c.out[0]));
+ memcpy(tmp, c.out:*byte, c.op * sizeof(c.out[0]));
+ free(c.a, c.out:*byte);
+ c.out = tmp:*peg_op;
+ }
+ }
+
+ c.out[c.op].tag = tag;
+ c.out[c.op].nargs = nargs;
+ c.out[c.op].start = start;
+ c.out[c.op].end = end;
+
+ c.op = c.op + 1;
+ c.depth = c.depth - nargs + 1;
+}
+
+enum {
+ P_grammar = 1,
+ P_rule,
+ P_pattern,
+ P_alternative,
+ P_lookop,
+ P_lookahead,
+ P_countop,
+ P_suffix,
+ P_primary,
+ P_any,
+ P_literal,
+ P_class,
+ P_call,
+ P_identifier,
+ P_sp,
+}
+
+tag_to_str(tag: int): *byte {
+ if tag == P_grammar { return "P_grammar"; }
+ if tag == P_rule { return "P_rule"; }
+ if tag == P_pattern { return "P_pattern"; }
+ if tag == P_alternative { return "P_alternative"; }
+ if tag == P_lookop { return "P_lookop"; }
+ if tag == P_lookahead { return "P_lookahead"; }
+ if tag == P_countop { return "P_countop"; }
+ if tag == P_suffix { return "P_suffix"; }
+ if tag == P_primary { return "P_primary"; }
+ if tag == P_any { return "P_any"; }
+ if tag == P_literal { return "P_literal"; }
+ if tag == P_class { return "P_class"; }
+ if tag == P_call { return "P_call"; }
+ if tag == P_identifier { return "P_identifier"; }
+ if tag == P_sp { return "P_sp"; }
+ return "(invalid)";
+}
+
+charclass(c: *peg, s: *byte): int {
var i: int;
var ch: int;
@@ -78,21 +177,21 @@ charclass(c: *compiler, s: *byte): int {
i = 0;
loop {
if !s[i] {
- break;
+ fail(c);
+ return FAIL;
}
if ch == (s[i]:int) {
- return OK;
+ break;
}
i = i + 1;
}
- fail(c);
- return FAIL;
+ return OK;
}
-any(c: *compiler): int {
+any(c: *peg): int {
var ch: int;
ch = get(c);
if ch == -1 {
@@ -103,12 +202,16 @@ any(c: *compiler): int {
}
// grammar <- sp rule+ !.
-p_grammar(c: *compiler): int {
+p_grammar(c: *peg): int {
+ enter(c);
+
if !p_sp(c) {
+ fail(c);
return FAIL;
}
if !p_rule(c) {
+ fail(c);
return FAIL;
}
@@ -121,39 +224,55 @@ p_grammar(c: *compiler): int {
}
choice(c);
- if !any(c) {
- return OK;
+ if any(c) {
+ fail(c);
+ fail(c);
+ return FAIL;
}
- fail(c);
- fail(c);
- return FAIL;
+ leave(c, P_grammar);
+ return OK;
}
-// rule <- ident '<-' sp pattern
-p_rule(c: *compiler): int {
- if !p_ident(c) {
+// rule <- identifier sp '<-' sp pattern
+p_rule(c: *peg): int {
+ enter(c);
+
+ if !p_identifier(c) {
+ fail(c);
+ return FAIL;
+ }
+
+ if !p_sp(c) {
+ fail(c);
return FAIL;
}
if !literal(c, "<-") {
+ fail(c);
return FAIL;
}
if !p_sp(c) {
+ fail(c);
return FAIL;
}
if !p_pattern(c) {
+ fail(c);
return FAIL;
}
+ leave(c, P_rule);
return OK;
}
-// pattern <- alt ( '/' sp alt )*
-p_pattern(c: *compiler): int {
- if !p_alt(c) {
+// pattern <- alternative ( '/' !'/' sp alternative )*
+p_pattern(c: *peg): int {
+ enter(c);
+
+ if !p_alternative(c) {
+ fail(c);
return FAIL;
}
@@ -164,176 +283,192 @@ p_pattern(c: *compiler): int {
break;
}
+ choice(c);
+ if literal(c, "/") {
+ fail(c);
+ fail(c);
+ return FAIL;
+ }
+
if !p_sp(c) {
break;
}
- if !p_alt(c) {
+ if !p_alternative(c) {
break;
}
commit(c);
}
+ leave(c, P_pattern);
return OK;
}
-// preop <- [!&] sp
-p_preop(c: *compiler): int {
- if !charclass(c, "!&") {
- return FAIL;
- }
+// lookop <- [!&]
+p_lookop(c: *peg): int {
+ enter(c);
- if !p_sp(c) {
+ if !charclass(c, "!&") {
+ fail(c);
return FAIL;
}
+ leave(c, P_lookop);
return OK;
}
-// alt <- ( preop? suffix )+
-p_alt(c: *compiler): int {
- choice(c);
- if p_preop(c) {
- commit(c);
- }
+// alternative <- lookahead+
+p_alternative(c: *peg): int {
+ enter(c);
- if !p_suffix(c) {
+ if !p_lookahead(c) {
+ fail(c);
return FAIL;
}
loop {
choice(c);
- choice(c);
- if p_preop(c) {
- commit(c);
- }
-
-
- if !p_suffix(c) {
+ if !p_lookahead(c) {
break;
}
commit(c);
}
+ leave(c, P_alternative);
return OK;
}
-// postop <- [*+?] sp
-p_postop(c: *compiler): int {
- if !charclass(c, "*+?") {
+// lookahead <- (lookop sp)? suffix
+p_lookahead(c: *peg): int {
+ enter(c);
+
+ choice(c);
+ if p_lookop(c) && p_sp(c) {
+ commit(c);
+ }
+
+ if !p_suffix(c) {
+ fail(c);
return FAIL;
}
- if !p_sp(c) {
+ leave(c, P_lookahead);
+ return OK;
+}
+
+// countop <- [*+?]
+p_countop(c: *peg): int {
+ enter(c);
+
+ if !charclass(c, "*+?") {
+ fail(c);
return FAIL;
}
+ leave(c, P_countop);
return OK;
}
-// suffix <- primary postop*
-p_suffix(c: *compiler): int {
+// suffix <- primary (countop sp)*
+p_suffix(c: *peg): int {
+ enter(c);
+
if !p_primary(c) {
+ fail(c);
return FAIL;
}
loop {
choice(c);
- if !p_postop(c) {
+ if !p_countop(c) {
+ break;
+ }
+
+ if !p_sp(c) {
break;
}
commit(c);
}
+ leave(c, P_suffix);
return OK;
}
-// primary <- group / any / literal / charclass / nonterminal
-p_primary(c: *compiler): int {
- choice(c);
- if p_group(c) {
- commit(c);
- return OK;
- }
-
- choice(c);
- if p_any(c) {
- commit(c);
- return OK;
- }
+// primary <- ( '(' sp pattern ')' / any / literal / class / call ) sp
+p_primary(c: *peg): int {
+ enter(c);
- choice(c);
- if p_literal(c) {
- commit(c);
- return OK;
- }
-
- choice(c);
- if p_charclass(c) {
- commit(c);
- return OK;
- }
-
- choice(c);
- if p_nonterminal(c) {
- commit(c);
- return OK;
- }
+ loop {
+ choice(c);
+ if literal(c, "(") && p_sp(c) && p_pattern(c) && literal(c, ")") {
+ commit(c);
+ break;
+ }
- fail(c);
- return FAIL;
-}
+ choice(c);
+ if p_any(c) {
+ commit(c);
+ break;
+ }
-// group <- '(' sp pattern ')' sp
-p_group(c: *compiler): int {
- if !literal(c, "(") {
- return FAIL;
- }
+ choice(c);
+ if p_literal(c) {
+ commit(c);
+ break;
+ }
- if !p_sp(c) {
- return FAIL;
- }
+ choice(c);
+ if p_class(c) {
+ commit(c);
+ break;
+ }
- if !p_pattern(c) {
- return FAIL;
- }
+ choice(c);
+ if p_call(c) {
+ commit(c);
+ break;
+ }
- if !literal(c, ")") {
+ fail(c);
+ fail(c);
return FAIL;
}
if !p_sp(c) {
+ fail(c);
return FAIL;
}
+ leave(c, P_primary);
return OK;
}
-// any <- '.' sp
-p_any(c: *compiler): int {
- if !literal(c, ".") {
- return FAIL;
- }
+// any <- '.'
+p_any(c: *peg): int {
+ enter(c);
- if !p_sp(c) {
+ if !literal(c, ".") {
+ fail(c);
return FAIL;
}
+ leave(c, P_any);
return OK;
}
-// literal <- ['] ( !['] . )* ['] sp
-p_literal(c: *compiler): int {
- choice(c);
+// literal <- ['] ( !['] . )* [']
+p_literal(c: *peg): int {
+ enter(c);
+
if !literal(c, "'") {
+ fail(c);
return FAIL;
}
- commit(c);
loop {
choice(c);
@@ -352,26 +487,23 @@ p_literal(c: *compiler): int {
commit(c);
}
- choice(c);
if !literal(c, "'") {
- return FAIL;
- }
- commit(c);
-
- if !p_sp(c) {
+ fail(c);
return FAIL;
}
+ leave(c, P_literal);
return OK;
}
-// charclass <- '[' ( !']' ( . '-' . / . ) )* ']' sp
-p_charclass(c: *compiler): int {
- choice(c);
+// charclass <- '[' ( !']' ( . '-' . / . ) )* ']'
+p_class(c: *peg): int {
+ enter(c);
+
if !literal(c, "[") {
+ fail(c);
return FAIL;
}
- commit(c);
loop {
choice(c);
@@ -388,50 +520,52 @@ p_charclass(c: *compiler): int {
}
choice(c);
- if literal(c, "-") {
- if any(c) {
- commit(c);
- }
+ if literal(c, "-") && any(c) {
+ commit(c);
}
commit(c);
}
- choice(c);
if !literal(c, "]") {
- return FAIL;
- }
- commit(c);
-
- if !p_sp(c) {
+ fail(c);
return FAIL;
}
+ leave(c, P_class);
return OK;
}
-// nonterminal <- ident !'<-'
-p_nonterminal(c: *compiler): int {
- if !p_ident(c) {
+// call <- identifier !'<-'
+p_call(c: *peg): int {
+ enter(c);
+
+ if !p_identifier(c) {
+ fail(c);
return FAIL;
}
choice(c);
- if !literal(c, "<-") {
- return OK;
+ if p_sp(c) && literal(c, "<-") {
+ fail(c);
+ fail(c);
+ fail(c);
+ return FAIL;
}
- fail(c);
- fail(c);
- return FAIL;
+
+ leave(c, P_call);
+ return OK;
}
-// ident <- [a-zA-Z]+ sp
-p_ident(c: *compiler): int {
+// identifier <- [a-zA-Z0-9_]+
+p_identifier(c: *peg): int {
var chars: *byte;
+ enter(c);
- chars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
+ chars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_";
if !charclass(c, chars) {
+ fail(c);
return FAIL;
}
@@ -445,15 +579,14 @@ p_ident(c: *compiler): int {
commit(c);
}
- if !p_sp(c) {
- return FAIL;
- }
-
+ leave(c, P_identifier);
return OK;
}
-// sp <- ( [ \t\r\n] / '#' ( ![\r\n] . )* )*
-p_sp(c: *compiler): int {
+// sp <- ( [ \t\r\n] / '//' ( ![\r\n] . )* )*
+p_sp(c: *peg): int {
+ enter(c);
+
loop {
choice(c);
@@ -465,9 +598,7 @@ p_sp(c: *compiler): int {
}
choice(c);
- if literal(c, "#") {
- commit(c);
-
+ if literal(c, "//") {
loop {
choice(c);
@@ -486,25 +617,29 @@ p_sp(c: *compiler): int {
}
commit(c);
+ commit(c);
continue;
}
+ fail(c);
break;
}
+ leave(c, P_sp);
return OK;
}
main(argc: int, argv: **byte, envp: **byte) {
var fd: int;
var a: alloc;
- var c: compiler;
+ var c: peg;
+ var i: int;
setup_alloc(&a);
c.a = &a;
c.pos = 0;
c.limit = 1024;
- c.stack = alloc(c.a, c.limit * sizeof(c.stack[0])):*int;
+ c.stack = alloc(c.a, c.limit * sizeof(c.stack[0])):*peg_frame;
if argc != 2 {
die("usage: ./peg <grammar.peg>");
@@ -517,8 +652,36 @@ main(argc: int, argv: **byte, envp: **byte) {
c.f = fopen(fd, c.a);
+ c.src = freadall(c.f, &c.size);
+
+ choice(&c);
if !p_grammar(&c) {
- die("FAIL");
+ die("Syntax error");
+ }
+ commit(&c);
+
+ i = 0;
+ loop {
+ if i == c.op {
+ break;
+ }
+
+ if c.out[i].nargs == 0 {
+ fdputs(1, tag_to_str(c.out[i].tag));
+ fdputc(1, ' ');
+ fdputc(1, '"');
+ write(1, &c.src[c.out[i].start], c.out[i].end - c.out[i].start);
+ fdputc(1, '"');
+ fdputc(1, '\n');
+ }
+
+ if c.out[i].nargs != 0 {
+ fdputs(1, tag_to_str(c.out[i].tag));
+ fdputc(1, ' ');
+ fdputd(1, c.out[i].nargs);
+ fdputc(1, '\n');
+ }
+
+ i = i + 1;
}
- fdputs(1, "OK\n");
}
diff --git a/peg.peg b/peg.peg
@@ -1,15 +1,15 @@
grammar <- sp rule+ !.
-rule <- ident '<-' sp pattern
-pattern <- alt ( '/' sp alt )*
-preop <- [!&] sp
-alt <- ( preop? suffix )+
-postop <- [*+?] sp
-suffix <- primary postop*
-primary <- group / any / literal / charclass / nonterminal
-group <- '(' sp pattern ')' sp
-any <- '.' sp
-literal <- ['] ( !['] . )* ['] sp
-charclass <- '[' ( !']' ( . '-' . / . ) )* ']' sp
-nonterminal <- ident !'<-'
-ident <- [a-zA-Z]+ sp
-sp <- ( [ \t\r\n] / '#' ( ![\r\n] . )* )*
+rule <- identifier sp '<-' sp pattern
+pattern <- alternative ( '/' !'/' sp alternative )*
+alternative <- lookahead+
+lookop <- [!&]
+lookahead <- (lookop sp)? suffix
+countop <- [*+?]
+suffix <- primary (countop sp)*
+primary <- ( '(' sp pattern ')' / any / literal / class / call ) sp
+any <- '.'
+literal <- ['] ( !['] . )* [']
+class <- '[' ( !']' ( . '-' . / . ) )* ']'
+call <- identifier !(sp '<-')
+identifier <- [a-zA-Z0-9_]+
+sp <- ( [ \t\r\n] / '//' ( ![\r\n] . )* )*