os

An operating system
git clone https://erai.gay/code/os/
Log | Files | Refs | README | LICENSE

commit f9c585acac4425598b1b0b0e739d1a33c22537c3
parent 1dde52de096e9a61c09328e7965a7638f8bba18d
Author: erai <erai@omiltem.net>
Date:   Sun,  6 Apr 2025 01:19:15 +0000

compile lexer to nfa

Diffstat:
Mbootstrap.sh | 2+-
Mcc0.c | 1029+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------
Mcc3.om | 21+++++++++++----------
Mlexer.om | 414++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
Mlib.om | 8++++++++
5 files changed, 1367 insertions(+), 107 deletions(-)

diff --git a/bootstrap.sh b/bootstrap.sh @@ -9,6 +9,6 @@ SOURCES="cc1.om type.om parse2.om peglib.om as.om decl.om node.om peg.om ir.om i ./cc0 ${LIBS} ${SOURCES} cc3.om -o cc1 -n cc1.lines -G cc1.call # Double check the bootstrap and self hosting compiler have the same output -./cc1 ${LIBS} ${SOURCES} cc3.om -C cc2.c -o cc2 -n cc2.lines -G cc2.call +./cc1 ${LIBS} ${SOURCES} cc3.om cc4.om -C cc2.c -o cc2 -n cc2.lines -G cc2.call cmp cc1 cc2 || echo cc mismatch cmp cc0.c cc2.c || echo bootstrap mismatch diff --git a/cc0.c b/cc0.c @@ -130,9 +130,19 @@ u zirreset(); u zirreturn(); u ziruseop(); u zlabels_to_ir(); +u zlalr_compiler(); u zlayout_struct(); u zlayout_union(); u zleave(); +u zlexer_compile(); +u zlexer_compile_alternative(); +u zlexer_compile_any(); +u zlexer_compile_charset(); +u zlexer_compile_pattern(); +u zlexer_compile_primary(); +u zlexer_compile_rule(); +u zlexer_compile_str(); +u zlexer_compile_suffix(); u zliteral(); u zlocals_to_ir(); u zmain(); @@ -167,6 +177,14 @@ u zmktype_struct(); u zmktype_union(); u zmmap(); u znext_decl(); +u znfa_alt(); +u znfa_any(); +u znfa_concat(); +u znfa_empty(); +u znfa_lit(); +u znfa_plus(); +u znfa_qmark(); +u znfa_star(); u znode_to_str(); u zopen(); u zopen_call_out(); @@ -251,6 +269,7 @@ u zpeg_P_lexer_op(); u zpeg_P_lexer_pattern(); u zpeg_P_lexer_primary(); u zpeg_P_lexer_rule(); +u zpeg_P_lexer_str(); u zpeg_P_lexer_suffix(); u zpeg_P_loop(); u zpeg_P_loop_stmt(); @@ -609,38 +628,40 @@ b210: if (vtag != 105UL) goto b212; b212: if (vtag != 106UL) goto b214; return (u)"P_peg_identifier"; b214: if (vtag != 107UL) goto b216; - return (u)"P_lalr_primary"; + return (u)"P_lexer_dot"; b216: if (vtag != 108UL) goto b218; - return (u)"P_lalr_op"; + return (u)"P_lexer_op"; b218: if (vtag != 109UL) goto b220; - return (u)"P_lalr_suffix"; + return (u)"P_lexer_str"; b220: if (vtag != 110UL) goto b222; - return (u)"P_lalr_alternative"; + return (u)"P_lexer_charset"; b222: if (vtag != 111UL) goto b224; - return (u)"P_lalr_pattern"; + return (u)"P_lexer_primary"; b224: if (vtag != 112UL) goto b226; - return (u)"P_lalr_rule"; + return (u)"P_lexer_suffix"; b226: if (vtag != 113UL) goto b228; - return (u)"P_lalr_grammar"; + return (u)"P_lexer_alternative"; b228: if (vtag != 114UL) goto b230; - return (u)"P_lexer_dot"; + return (u)"P_lexer_pattern"; b230: if (vtag != 115UL) goto b232; - return (u)"P_lexer_op"; + return (u)"P_lexer_rule"; b232: if (vtag != 116UL) goto b234; - return (u)"P_lexer_charset"; + return (u)"P_lexer_grammar"; b234: if (vtag != 117UL) goto b236; - return (u)"P_lexer_primary"; + return (u)"P_lalr_primary"; b236: if (vtag != 118UL) goto b238; - return (u)"P_lexer_suffix"; + return (u)"P_lalr_op"; b238: if (vtag != 119UL) goto b240; - return (u)"P_lexer_alternative"; + return (u)"P_lalr_suffix"; b240: if (vtag != 120UL) goto b242; - return (u)"P_lexer_pattern"; + return (u)"P_lalr_alternative"; b242: if (vtag != 121UL) goto b244; - return (u)"P_lexer_rule"; + return (u)"P_lalr_pattern"; b244: if (vtag != 122UL) goto b246; - return (u)"P_lexer_grammar"; -b246: return 0UL; + return (u)"P_lalr_rule"; +b246: if (vtag != 123UL) goto b248; + return (u)"P_lalr_grammar"; +b248: return 0UL; } u z_start(u vargc, u vargv, u venvp) { u v3 = 0; @@ -22476,6 +22497,9 @@ b34: goto b7; b5: v4 = 1UL; goto b6; } +u zlalr_compiler(u vc, u vpn, u verr) { + return 0UL; +} u zlayout_struct(u vc, u vd) { u vm = 0; u voffset = 0; @@ -22721,6 +22745,504 @@ b7: *(u*)(vc + 168UL) = *(u*)(vc + 168UL) * 2UL; goto b5; b4: goto b2; } +u zlexer_compile(u vc, u vpn, u verr) { + u va = 0; + u vb = 0; + u v5 = 0; + u v6 = 0; + u v7 = 0; + u v8 = 0; + u v9 = 0; + u v10 = 0; + u v11 = 0; + u v12 = 0; + u v13 = 0; + u v14 = 0; + vpn = *(u*)(vpn + 16UL); +b1: if (!vpn) goto b7; + v5 = 0UL; +b8: if (!v5) goto b5; + return 0UL; +b5: v6 = (u)zlexer_compile_rule; + v7 = vc; + v8 = vpn; + v9 = ((u(*)())v6)(v7, v8); + vb = v9; + v10 = (u)znfa_alt; + v11 = vc; + v12 = va; + v13 = vb; + v14 = ((u(*)())v10)(v11, v12, v13); + va = v14; + vpn = *(u*)(vpn + 8UL); + goto b1; +b7: v5 = 1UL; + goto b8; +} +u zlexer_compile_alternative(u vc, u vpn) { + u vpart = 0; + u va = 0; + u vb = 0; + u v5 = 0; + u v6 = 0; + u v7 = 0; + u v8 = 0; + u v9 = 0; + u v10 = 0; + u v11 = 0; + u v12 = 0; + u v13 = 0; + u v14 = 0; + u v15 = 0; + u v16 = 0; + u v17 = 0; + vpart = *(u*)(vpn + 16UL); + v5 = (u)znfa_empty; + v6 = vc; + v7 = ((u(*)())v5)(v6); + va = v7; +b2: if (!vpart) goto b8; + v8 = 0UL; +b9: if (!v8) goto b6; + return va; +b6: v9 = (u)zlexer_compile_suffix; + v10 = vc; + v11 = vpart; + v12 = ((u(*)())v9)(v10, v11); + vb = v12; + v13 = (u)znfa_concat; + v14 = vc; + v15 = va; + v16 = vb; + v17 = ((u(*)())v13)(v14, v15, v16); + va = v17; + vpart = *(u*)(vpart + 8UL); + goto b2; +b8: v8 = 1UL; + goto b9; +} +u zlexer_compile_any(u vc, u vpn) { + u v2 = 0; + u v3 = 0; + u v4 = 0; + v2 = (u)znfa_any; + v3 = vc; + v4 = ((u(*)())v2)(v3); + return v4; +} +u zlexer_compile_charset(u vc, u vpn) { + u vi = 0; + u vj = 0; + u vlen = 0; + u vch = 0; + u vx = 0; + u vok = 0; + u vs = 0; + u vscratch = 0; + u vstart = 0; + u vend = 0; + u va = 0; + u vb = 0; + u v14 = 0; + u v15 = 0; + u v16 = 0; + u v17 = 0; + u v18 = 0; + u v19 = 0; + u v20 = 0; + u v21 = 0; + u v22 = 0; + u v23 = 0; + u v24 = 0; + u v25 = 0; + u v26 = 0; + u v27 = 0; + u v28 = 0; + u v29 = 0; + u v30 = 0; + u v31 = 0; + u v32 = 0; + u v33 = 0; + u v34 = 0; + u v35 = 0; + u v36 = 0; + u v37 = 0; + u v38 = 0; + u v39 = 0; + u v40 = 0; + u v41 = 0; + u v42 = 0; + u v43 = 0; + u v44 = 0; + u v45 = 0; + u v46 = 0; + u v47 = 0; + u v48 = 0; + u v49 = 0; + u v50 = 0; + u v51 = 0; + v14 = (u)zalloc; + v15 = *(u*)(vc + 0UL); + v16 = 256UL; + v17 = ((u(*)())v14)(v15, v16); + vscratch = v17; + vs = *(u*)(vpn + 24UL); + vi = 2UL; + vlen = *(u*)(vpn + 32UL) - 2UL; + vx = 1UL; +b2: if (vi != vlen) goto b6; + vj = 0UL; +b40: if (vj != 256UL) goto b44; + v48 = (u)zfree; + v49 = *(u*)(vc + 0UL); + v50 = vscratch; + v51 = ((u(*)())v48)(v49, v50); + v51; + return va; +b44: if (!(u)*(b*)(vscratch + vj * 1UL)) goto b47; + v39 = (u)znfa_lit; + v40 = vc; + v41 = vj; + v42 = ((u(*)())v39)(v40, v41); + vb = v42; + v43 = (u)znfa_alt; + v44 = vc; + v45 = va; + v46 = vb; + v47 = ((u(*)())v43)(v44, v45, v46); + va = v47; +b45: vj = vj + 1UL; + goto b40; +b47: goto b45; +b6: vch = (u)*(b*)(vs + vi * 1UL); + if (vch != 92UL) goto b9; + v18 = (u)zunescape; + v19 = vs; + v20 = (u)&vi; + v21 = vlen; + v22 = (u)&vok; + v23 = ((u(*)())v18)(v19, v20, v21, v22); + vch = v23; + if (!vok) goto b15; + v24 = 0UL; +b16: if (!v24) goto b13; + v25 = (u)zdie; + v26 = (u)"lexer_compile_charset: invalid escape"; + v27 = ((u(*)())v25)(v26); + v27; +b11:b7: vstart = vch; + vend = vch; + if (vi == vlen) goto b24; + if ((u)*(b*)(vs + vi * 1UL) != 45UL) goto b24; + v28 = 1UL; +b26: if (!v28) goto b22; + vi = vi + 1UL; + v29 = (u)zunescape; + v30 = vs; + v31 = (u)&vi; + v32 = vlen; + v33 = (u)&vok; + v34 = ((u(*)())v29)(v30, v31, v32, v33); + vch = v34; + if (!vok) goto b32; + v35 = 0UL; +b33: if (!v35) goto b30; + v36 = (u)zdie; + v37 = (u)"lexer_compile_charset: invalid escape"; + v38 = ((u(*)())v36)(v37); + v38; +b28: vend = vch; +b20: vj = vstart; +b35: if ((s)vj <= (s)vend) goto b39; + goto b2; +b39: *(b*)(vscratch + vj * 1UL) = vx; + vj = vj + 1UL; + goto b35; +b30: goto b28; +b32: v35 = 1UL; + goto b33; +b22: goto b20; +b24: v28 = 0UL; + goto b26; +b13: goto b11; +b15: v24 = 1UL; + goto b16; +b9: if (vch != 94UL) goto b19; + vx = 0UL; + vi = vi + 1UL; + goto b2; +b19: vi = vi + 1UL; + goto b7; +} +u zlexer_compile_pattern(u vc, u vpn) { + u valt = 0; + u va = 0; + u vb = 0; + u v5 = 0; + u v6 = 0; + u v7 = 0; + u v8 = 0; + u v9 = 0; + u v10 = 0; + u v11 = 0; + u v12 = 0; + u v13 = 0; + u v14 = 0; + valt = *(u*)(vpn + 16UL); + va = 0UL; +b1: if (!valt) goto b7; + v5 = 0UL; +b8: if (!v5) goto b5; + return va; +b5: v6 = (u)zlexer_compile_alternative; + v7 = vc; + v8 = valt; + v9 = ((u(*)())v6)(v7, v8); + vb = v9; + v10 = (u)znfa_alt; + v11 = vc; + v12 = va; + v13 = vb; + v14 = ((u(*)())v10)(v11, v12, v13); + va = v14; + valt = *(u*)(valt + 8UL); + goto b1; +b7: v5 = 1UL; + goto b8; +} +u zlexer_compile_primary(u vc, u vpn) { + u vtag = 0; + u v3 = 0; + u v4 = 0; + u v5 = 0; + u v6 = 0; + u v7 = 0; + u v8 = 0; + u v9 = 0; + u v10 = 0; + u v11 = 0; + u v12 = 0; + u v13 = 0; + u v14 = 0; + u v15 = 0; + u v16 = 0; + u v17 = 0; + u v18 = 0; + u v19 = 0; + u v20 = 0; + u v21 = 0; + vpn = *(u*)(vpn + 16UL); + vtag = *(u*)(vpn + 0UL); + if (vtag != 114UL) goto b3; + v3 = (u)zlexer_compile_pattern; + v4 = vc; + v5 = vpn; + v6 = ((u(*)())v3)(v4, v5); + return v6; +b3: if (vtag != 107UL) goto b6; + v7 = (u)zlexer_compile_any; + v8 = vc; + v9 = vpn; + v10 = ((u(*)())v7)(v8, v9); + return v10; +b6: if (vtag != 109UL) goto b9; + v11 = (u)zlexer_compile_str; + v12 = vc; + v13 = vpn; + v14 = ((u(*)())v11)(v12, v13); + return v14; +b9: if (vtag != 110UL) goto b12; + v15 = (u)zlexer_compile_charset; + v16 = vc; + v17 = vpn; + v18 = ((u(*)())v15)(v16, v17); + return v18; +b12: v19 = (u)zdie; + v20 = (u)"invalid lexer_primary"; + v21 = ((u(*)())v19)(v20); + v21; + return 0UL; +} +u zlexer_compile_rule(u vc, u vpn) { + u vident = 0; + u vpat = 0; + u va = 0; + u vb = 0; + u v6 = 0; + u v7 = 0; + u v8 = 0; + u v9 = 0; + u v10 = 0; + u v11 = 0; + u v12 = 0; + u v13 = 0; + u v14 = 0; + u v15 = 0; + u v16 = 0; + u v17 = 0; + vident = *(u*)(vpn + 16UL); + vpat = *(u*)(vident + 8UL); + v6 = (u)zlexer_compile_pattern; + v7 = vc; + v8 = vpat; + v9 = ((u(*)())v6)(v7, v8); + va = v9; + v10 = (u)znfa_empty; + v11 = vc; + v12 = ((u(*)())v10)(v11); + vb = v12; + *(u*)(vb + 0UL) = 1UL; + v13 = (u)znfa_concat; + v14 = vc; + v15 = va; + v16 = vb; + v17 = ((u(*)())v13)(v14, v15, v16); + return v17; +} +u zlexer_compile_str(u vc, u vpn) { + u vi = 0; + u vlen = 0; + u vok = 0; + u vch = 0; + u va = 0; + u vb = 0; + u v8 = 0; + u v9 = 0; + u v10 = 0; + u v11 = 0; + u v12 = 0; + u v13 = 0; + u v14 = 0; + u v15 = 0; + u v16 = 0; + u v17 = 0; + u v18 = 0; + u v19 = 0; + u v20 = 0; + u v21 = 0; + u v22 = 0; + u v23 = 0; + u v24 = 0; + u v25 = 0; + u v26 = 0; + u v27 = 0; + u v28 = 0; + u v29 = 0; + vi = 1UL; + vlen = *(u*)(vpn + 32UL) - 1UL; + v8 = (u)znfa_empty; + v9 = vc; + v10 = ((u(*)())v8)(v9); + va = v10; +b2: if (vi != vlen) goto b6; + return va; +b6: v11 = (u)zunescape; + v12 = *(u*)(vpn + 24UL); + v13 = (u)&vi; + v14 = vlen; + v15 = (u)&vok; + v16 = ((u(*)())v11)(v12, v13, v14, v15); + vch = v16; + if (!vok) goto b12; + v17 = 0UL; +b13: if (!v17) goto b10; + v18 = (u)zdie; + v19 = (u)"lexer_compile_str: invalid escape"; + v20 = ((u(*)())v18)(v19); + v20; +b8: v21 = (u)znfa_lit; + v22 = vc; + v23 = vch; + v24 = ((u(*)())v21)(v22, v23); + vb = v24; + v25 = (u)znfa_concat; + v26 = vc; + v27 = va; + v28 = vb; + v29 = ((u(*)())v25)(v26, v27, v28); + va = v29; + goto b2; +b10: goto b8; +b12: v17 = 1UL; + goto b13; +} +u zlexer_compile_suffix(u vc, u vpn) { + u vprimary = 0; + u vop = 0; + u vn = 0; + u vch = 0; + u vzero = 0; + u vmore = 0; + u v8 = 0; + u v9 = 0; + u v10 = 0; + u v11 = 0; + u v12 = 0; + u v13 = 0; + u v14 = 0; + u v15 = 0; + u v16 = 0; + u v17 = 0; + u v18 = 0; + u v19 = 0; + u v20 = 0; + u v21 = 0; + u v22 = 0; + u v23 = 0; + u v24 = 0; + u v25 = 0; + vprimary = *(u*)(vpn + 16UL); + v8 = (u)zlexer_compile_primary; + v9 = vc; + v10 = vprimary; + v11 = ((u(*)())v8)(v9, v10); + vn = v11; + vzero = 0UL; + vmore = 0UL; + vop = *(u*)(vprimary + 8UL); +b2: if (!vop) goto b8; + v12 = 0UL; +b9: if (!v12) goto b6; + if (!vzero) goto b21; + if (!vmore) goto b21; + v13 = 1UL; +b23: if (!v13) goto b19; + v14 = (u)znfa_star; + v15 = vc; + v16 = vn; + v17 = ((u(*)())v14)(v15, v16); + return v17; +b19: if (!vzero) goto b26; + v18 = (u)znfa_qmark; + v19 = vc; + v20 = vn; + v21 = ((u(*)())v18)(v19, v20); + return v21; +b26: if (!vmore) goto b29; + v22 = (u)znfa_plus; + v23 = vc; + v24 = vn; + v25 = ((u(*)())v22)(v23, v24); + return v25; +b29: return vn; +b21: v13 = 0UL; + goto b23; +b6: vch = (u)*(b*)(*(u*)(vop + 24UL) + 0UL * 1UL); + if (vch != 42UL) goto b12; + vzero = 1UL; + vmore = 1UL; +b10: vop = *(u*)(vop + 8UL); + goto b2; +b12: if (vch != 43UL) goto b14; + vmore = 1UL; + goto b10; +b14: if (vch != 63UL) goto b16; + vzero = 1UL; + goto b10; +b16: goto b10; +b8: v12 = 1UL; + goto b9; +} u zliteral(u vc, u vs) { u vi = 0; u vch = 0; @@ -23968,6 +24490,296 @@ u znext_decl(u vc, u vd) { v4 = ((u(*)())v2)(v3); return v4; } +u znfa_alt(u vc, u va, u vb) { + u vret = 0; + u vn = 0; + u vtail = 0; + u vi = 0; + u vj = 0; + u v8 = 0; + u v9 = 0; + u v10 = 0; + u v11 = 0; + u v12 = 0; + u v13 = 0; + u v14 = 0; + u v15 = 0; + u v16 = 0; + u v17 = 0; + u v18 = 0; + u v19 = 0; + u v20 = 0; + u v21 = 0; + u v22 = 0; + u v23 = 0; + u v24 = 0; + u v25 = 0; + if (!va) goto b5; + v8 = 0UL; +b6: if (!v8) goto b3; + return vb; +b3: if (!vb) goto b11; + v9 = 0UL; +b12: if (!v9) goto b9; + return va; +b9: v10 = (u)zalloc; + v11 = *(u*)(vc + 0UL); + v12 = 48UL; + v13 = ((u(*)())v10)(v11, v12); + vn = v13; + *(u*)(vn + 8UL) = -1UL; + *(u*)(vn + 16UL) = va; + *(u*)(vn + 24UL) = vb; + *(u*)(vn + 40UL) = *(u*)(va + 40UL) + *(u*)(vb + 40UL); + v14 = (u)zalloc; + v15 = *(u*)(vc + 0UL); + v16 = 8UL * *(u*)(vn + 40UL); + v17 = ((u(*)())v14)(v15, v16); + *(u*)(vn + 32UL) = v17; + vi = 0UL; + vj = 0UL; +b15: if (vi != *(u*)(va + 40UL)) goto b19; +b20: if (vi != *(u*)(vb + 40UL)) goto b24; + v18 = (u)zfree; + v19 = *(u*)(vc + 0UL); + v20 = *(u*)(va + 32UL); + v21 = ((u(*)())v18)(v19, v20); + v21; + v22 = (u)zfree; + v23 = *(u*)(vc + 0UL); + v24 = *(u*)(vb + 32UL); + v25 = ((u(*)())v22)(v23, v24); + v25; + return vret; +b24: *(u*)(*(u*)(vn + 32UL) + vj * 8UL) = *(u*)(*(u*)(vb + 32UL) + vi * 8UL); + vj = vj + 1UL; + vi = vi + 1UL; + goto b20; +b19: *(u*)(*(u*)(vn + 32UL) + vj * 8UL) = *(u*)(*(u*)(va + 32UL) + vi * 8UL); + vj = vj + 1UL; + vi = vi + 1UL; + goto b15; +b11: v9 = 1UL; + goto b12; +b5: v8 = 1UL; + goto b6; +} +u znfa_any(u vc) { + u v1 = 0; + u v2 = 0; + u v3 = 0; + u v4 = 0; + v1 = (u)znfa_lit; + v2 = vc; + v3 = -1UL; + v4 = ((u(*)())v1)(v2, v3); + return v4; +} +u znfa_concat(u vc, u va, u vb) { + u vi = 0; + u v4 = 0; + u v5 = 0; + u v6 = 0; + u v7 = 0; + u v8 = 0; + u v9 = 0; + if (!va) goto b5; + v4 = 0UL; +b6: if (!v4) goto b3; + return 0UL; +b3: if (!vb) goto b11; + v5 = 0UL; +b12: if (!v5) goto b9; + return 0UL; +b9: vi = 0UL; +b13: if (vi != *(u*)(va + 40UL)) goto b17; + v6 = (u)zfree; + v7 = *(u*)(vc + 0UL); + v8 = *(u*)(va + 32UL); + v9 = ((u(*)())v6)(v7, v8); + v9; + *(u*)(va + 32UL) = *(u*)(vb + 32UL); + *(u*)(va + 40UL) = *(u*)(vb + 40UL); + return va; +b17: *(u*)(*(u*)(*(u*)(va + 32UL) + vi * 8UL) + 16UL) = vb; + vi = vi + 1UL; + goto b13; +b11: v5 = 1UL; + goto b12; +b5: v4 = 1UL; + goto b6; +} +u znfa_empty(u vc) { + u vn = 0; + u v2 = 0; + u v3 = 0; + u v4 = 0; + u v5 = 0; + u v6 = 0; + u v7 = 0; + u v8 = 0; + u v9 = 0; + v2 = (u)zalloc; + v3 = *(u*)(vc + 0UL); + v4 = 48UL; + v5 = ((u(*)())v2)(v3, v4); + vn = v5; + *(u*)(vn + 8UL) = -1UL; + *(u*)(vn + 16UL) = 0UL; + *(u*)(vn + 24UL) = 0UL; + v6 = (u)zalloc; + v7 = *(u*)(vc + 0UL); + v8 = 8UL; + v9 = ((u(*)())v6)(v7, v8); + *(u*)(vn + 32UL) = v9; + *(u*)(vn + 40UL) = 1UL; + *(u*)(*(u*)(vn + 32UL) + 0UL * 8UL) = vn; + return vn; +} +u znfa_lit(u vc, u vch) { + u vn = 0; + u v3 = 0; + u v4 = 0; + u v5 = 0; + u v6 = 0; + u v7 = 0; + u v8 = 0; + u v9 = 0; + u v10 = 0; + v3 = (u)zalloc; + v4 = *(u*)(vc + 0UL); + v5 = 48UL; + v6 = ((u(*)())v3)(v4, v5); + vn = v6; + *(u*)(vn + 16UL) = 0UL; + *(u*)(vn + 8UL) = vch; + v7 = (u)zalloc; + v8 = *(u*)(vc + 0UL); + v9 = 8UL; + v10 = ((u(*)())v7)(v8, v9); + *(u*)(vn + 32UL) = v10; + *(u*)(vn + 40UL) = 1UL; + *(u*)(*(u*)(vn + 32UL) + 0UL * 8UL) = vn; + return vn; +} +u znfa_plus(u vc, u va) { + u vb = 0; + u v3 = 0; + u v4 = 0; + u v5 = 0; + u v6 = 0; + u v7 = 0; + u v8 = 0; + u v9 = 0; + u v10 = 0; + u v11 = 0; + u v12 = 0; + u v13 = 0; + u v14 = 0; + u v15 = 0; + u v16 = 0; + if (!va) goto b5; + v3 = 0UL; +b6: if (!v3) goto b3; + return 0UL; +b3: v4 = (u)zalloc; + v5 = *(u*)(vc + 0UL); + v6 = 48UL; + v7 = ((u(*)())v4)(v5, v6); + vb = v7; + *(u*)(vb + 8UL) = -1UL; + *(u*)(vb + 16UL) = 0UL; + *(u*)(vb + 24UL) = va; + v8 = (u)zalloc; + v9 = *(u*)(vc + 0UL); + v10 = 8UL; + v11 = ((u(*)())v8)(v9, v10); + *(u*)(vb + 32UL) = v11; + *(u*)(vb + 40UL) = 1UL; + *(u*)(*(u*)(vb + 32UL) + 0UL * 8UL) = vb; + v12 = (u)znfa_concat; + v13 = vc; + v14 = va; + v15 = vb; + v16 = ((u(*)())v12)(v13, v14, v15); + return v16; +b5: v3 = 1UL; + goto b6; +} +u znfa_qmark(u vc, u va) { + u vb = 0; + u vi = 0; + u v4 = 0; + u v5 = 0; + u v6 = 0; + u v7 = 0; + u v8 = 0; + u v9 = 0; + u v10 = 0; + u v11 = 0; + u v12 = 0; + u v13 = 0; + u v14 = 0; + u v15 = 0; + u v16 = 0; + u v17 = 0; + u v18 = 0; + u v19 = 0; + if (!va) goto b5; + v4 = 0UL; +b6: if (!v4) goto b3; + v5 = (u)znfa_empty; + v6 = vc; + v7 = ((u(*)())v5)(v6); + return v7; +b3: v8 = (u)zalloc; + v9 = *(u*)(vc + 0UL); + v10 = 48UL; + v11 = ((u(*)())v8)(v9, v10); + vb = v11; + *(u*)(vb + 8UL) = -1UL; + *(u*)(vb + 16UL) = 0UL; + *(u*)(vb + 24UL) = va; + *(u*)(vb + 40UL) = *(u*)(va + 40UL) + 1UL; + v12 = (u)zalloc; + v13 = *(u*)(vc + 0UL); + v14 = 8UL * *(u*)(vb + 40UL); + v15 = ((u(*)())v12)(v13, v14); + *(u*)(vb + 32UL) = v15; +b10: if (vi != *(u*)(va + 40UL)) goto b14; + *(u*)(*(u*)(vb + 32UL) + vi * 8UL) = vb; + v16 = (u)zfree; + v17 = *(u*)(vc + 0UL); + v18 = *(u*)(va + 32UL); + v19 = ((u(*)())v16)(v17, v18); + v19; + return vb; +b14: *(u*)(*(u*)(vb + 32UL) + vi * 8UL) = *(u*)(*(u*)(va + 32UL) + vi * 8UL); + vi = vi + 1UL; + goto b10; +b5: v4 = 1UL; + goto b6; +} +u znfa_star(u vc, u va) { + u v2 = 0; + u v3 = 0; + u v4 = 0; + u v5 = 0; + u v6 = 0; + u v7 = 0; + u v8 = 0; + u v9 = 0; + v2 = (u)znfa_plus; + v3 = vc; + v4 = va; + v5 = ((u(*)())v2)(v3, v4); + va = v5; + v6 = (u)znfa_qmark; + v7 = vc; + v8 = va; + v9 = ((u(*)())v6)(v7, v8); + return v9; +} u znode_to_str(u vkind) { if (vkind != 0UL) goto b3; return (u)"N_IDENT"; @@ -28314,11 +29126,11 @@ b22: zchoice(vc); if (v6 == 0UL) goto b26; goto b9; b26: zchoice(vc); - v7 = zpeg_P_lalr_grammar(vc); + v7 = zpeg_P_lexer_grammar(vc); if (v7 == 0UL) goto b30; goto b9; b30: zchoice(vc); - v8 = zpeg_P_lexer_grammar(vc); + v8 = zpeg_P_lalr_grammar(vc); if (v8 == 0UL) goto b34; goto b9; b34: zfail(vc); @@ -28523,13 +29335,13 @@ b5: zleave(vc, 90UL); } u zpeg_P_lalr_alternative(u vc) { u v1 = 0; - zenter(vc, 110UL); + zenter(vc, 120UL); b3: zchoice(vc); v1 = zpeg_P_lalr_suffix(vc); if (v1 == 0UL) goto b4; zcommit(vc); goto b3; -b4: zleave(vc, 110UL); +b4: zleave(vc, 120UL); return 1UL; } u zpeg_P_lalr_grammar(u vc) { @@ -28541,7 +29353,7 @@ u zpeg_P_lalr_grammar(u vc) { u v6 = 0; u v7 = 0; u v8 = 0; - zenter(vc, 113UL); + zenter(vc, 123UL); v1 = zliteral(vc, (u)"lalr"); if (v1 == 0UL) goto b1; v2 = zpeg_P_sp(vc); @@ -28561,7 +29373,7 @@ b14: v7 = zliteral(vc, (u)"}"); if (v7 == 0UL) goto b1; v8 = zpeg_P_sp(vc); if (v8 == 0UL) goto b1; - zleave(vc, 113UL); + zleave(vc, 123UL); return 1UL; b1: zfail(vc); return 0UL; @@ -28570,12 +29382,12 @@ u zpeg_P_lalr_op(u vc) { u v1 = 0; u v2 = 0; u v3 = 0; - zenter(vc, 108UL); + zenter(vc, 118UL); zchoice(vc); v1 = zliteral(vc, (u)"*"); if (v1 == 0UL) goto b5; b4: zcommit(vc); - zleave(vc, 108UL); + zleave(vc, 118UL); return 1UL; b5: zchoice(vc); v2 = zliteral(vc, (u)"+"); @@ -28594,7 +29406,7 @@ u zpeg_P_lalr_pattern(u vc) { u v2 = 0; u v3 = 0; u v4 = 0; - zenter(vc, 111UL); + zenter(vc, 121UL); v1 = zpeg_P_lalr_alternative(vc); if (v1 == 0UL) goto b1; b5: zchoice(vc); @@ -28606,7 +29418,7 @@ b5: zchoice(vc); if (v4 == 0UL) goto b6; zcommit(vc); goto b5; -b6: zleave(vc, 111UL); +b6: zleave(vc, 121UL); return 1UL; b1: zfail(vc); return 0UL; @@ -28619,7 +29431,7 @@ u zpeg_P_lalr_primary(u vc) { u v5 = 0; u v6 = 0; u v7 = 0; - zenter(vc, 107UL); + zenter(vc, 117UL); zchoice(vc); v1 = zliteral(vc, (u)"("); if (v1 == 0UL) goto b5; @@ -28632,7 +29444,7 @@ u zpeg_P_lalr_primary(u vc) { v5 = zpeg_P_sp(vc); if (v5 == 0UL) goto b5; b4: zcommit(vc); - zleave(vc, 107UL); + zleave(vc, 117UL); return 1UL; b5: zchoice(vc); v6 = zpeg_P_ident(vc); @@ -28652,7 +29464,7 @@ u zpeg_P_lalr_rule(u vc) { u v5 = 0; u v6 = 0; u v7 = 0; - zenter(vc, 112UL); + zenter(vc, 122UL); v1 = zpeg_P_ident(vc); if (v1 == 0UL) goto b1; v2 = zpeg_P_sp(vc); @@ -28667,7 +29479,7 @@ u zpeg_P_lalr_rule(u vc) { if (v6 == 0UL) goto b1; v7 = zpeg_P_sp(vc); if (v7 == 0UL) goto b1; - zleave(vc, 112UL); + zleave(vc, 122UL); return 1UL; b1: zfail(vc); return 0UL; @@ -28676,7 +29488,7 @@ u zpeg_P_lalr_suffix(u vc) { u v1 = 0; u v2 = 0; u v3 = 0; - zenter(vc, 109UL); + zenter(vc, 119UL); v1 = zpeg_P_lalr_primary(vc); if (v1 == 0UL) goto b1; b5: zchoice(vc); @@ -28686,7 +29498,7 @@ b5: zchoice(vc); if (v3 == 0UL) goto b6; zcommit(vc); goto b5; -b6: zleave(vc, 109UL); +b6: zleave(vc, 119UL); return 1UL; b1: zfail(vc); return 0UL; @@ -28719,13 +29531,13 @@ b5: zleave(vc, 89UL); } u zpeg_P_lexer_alternative(u vc) { u v1 = 0; - zenter(vc, 119UL); + zenter(vc, 113UL); b3: zchoice(vc); v1 = zpeg_P_lexer_suffix(vc); if (v1 == 0UL) goto b4; zcommit(vc); goto b3; -b4: zleave(vc, 119UL); +b4: zleave(vc, 113UL); return 1UL; } u zpeg_P_lexer_charset(u vc) { @@ -28736,7 +29548,7 @@ u zpeg_P_lexer_charset(u vc) { u v5 = 0; u v6 = 0; u v7 = 0; - zenter(vc, 116UL); + zenter(vc, 110UL); v1 = zliteral(vc, (u)"[["); if (v1 == 0UL) goto b1; b5: zchoice(vc); @@ -28757,7 +29569,7 @@ b9: zcommit(vc); b26: zfail(vc); v7 = zliteral(vc, (u)"]]"); if (v7 == 0UL) goto b1; - zleave(vc, 116UL); + zleave(vc, 110UL); return 1UL; b1: zfail(vc); return 0UL; @@ -28773,10 +29585,10 @@ b18: v4 = zany(vc); } u zpeg_P_lexer_dot(u vc) { u v1 = 0; - zenter(vc, 114UL); + zenter(vc, 107UL); v1 = zliteral(vc, (u)"."); if (v1 == 0UL) goto b1; - zleave(vc, 114UL); + zleave(vc, 107UL); return 1UL; b1: zfail(vc); return 0UL; @@ -28790,7 +29602,7 @@ u zpeg_P_lexer_grammar(u vc) { u v6 = 0; u v7 = 0; u v8 = 0; - zenter(vc, 122UL); + zenter(vc, 116UL); v1 = zliteral(vc, (u)"lexer"); if (v1 == 0UL) goto b1; v2 = zpeg_P_sp(vc); @@ -28810,7 +29622,7 @@ b14: v7 = zliteral(vc, (u)"}"); if (v7 == 0UL) goto b1; v8 = zpeg_P_sp(vc); if (v8 == 0UL) goto b1; - zleave(vc, 122UL); + zleave(vc, 116UL); return 1UL; b1: zfail(vc); return 0UL; @@ -28819,12 +29631,12 @@ u zpeg_P_lexer_op(u vc) { u v1 = 0; u v2 = 0; u v3 = 0; - zenter(vc, 115UL); + zenter(vc, 108UL); zchoice(vc); v1 = zliteral(vc, (u)"*"); if (v1 == 0UL) goto b5; b4: zcommit(vc); - zleave(vc, 115UL); + zleave(vc, 108UL); return 1UL; b5: zchoice(vc); v2 = zliteral(vc, (u)"+"); @@ -28843,7 +29655,7 @@ u zpeg_P_lexer_pattern(u vc) { u v2 = 0; u v3 = 0; u v4 = 0; - zenter(vc, 120UL); + zenter(vc, 114UL); v1 = zpeg_P_lexer_alternative(vc); if (v1 == 0UL) goto b1; b5: zchoice(vc); @@ -28855,7 +29667,7 @@ b5: zchoice(vc); if (v4 == 0UL) goto b6; zcommit(vc); goto b5; -b6: zleave(vc, 120UL); +b6: zleave(vc, 114UL); return 1UL; b1: zfail(vc); return 0UL; @@ -28872,7 +29684,7 @@ u zpeg_P_lexer_primary(u vc) { u v9 = 0; u v10 = 0; u v11 = 0; - zenter(vc, 117UL); + zenter(vc, 111UL); zchoice(vc); v1 = zliteral(vc, (u)"("); if (v1 == 0UL) goto b5; @@ -28885,7 +29697,7 @@ u zpeg_P_lexer_primary(u vc) { v5 = zpeg_P_sp(vc); if (v5 == 0UL) goto b5; b4: zcommit(vc); - zleave(vc, 117UL); + zleave(vc, 111UL); return 1UL; b5: zchoice(vc); v6 = zpeg_P_lexer_dot(vc); @@ -28894,7 +29706,7 @@ b5: zchoice(vc); if (v7 == 0UL) goto b17; goto b4; b17: zchoice(vc); - v8 = zpeg_P_str(vc); + v8 = zpeg_P_lexer_str(vc); if (v8 == 0UL) goto b23; v9 = zpeg_P_sp(vc); if (v9 == 0UL) goto b23; @@ -28917,7 +29729,7 @@ u zpeg_P_lexer_rule(u vc) { u v5 = 0; u v6 = 0; u v7 = 0; - zenter(vc, 121UL); + zenter(vc, 115UL); v1 = zpeg_P_ident(vc); if (v1 == 0UL) goto b1; v2 = zpeg_P_sp(vc); @@ -28932,7 +29744,17 @@ u zpeg_P_lexer_rule(u vc) { if (v6 == 0UL) goto b1; v7 = zpeg_P_sp(vc); if (v7 == 0UL) goto b1; - zleave(vc, 121UL); + zleave(vc, 115UL); + return 1UL; +b1: zfail(vc); + return 0UL; +} +u zpeg_P_lexer_str(u vc) { + u v1 = 0; + zenter(vc, 109UL); + v1 = zpeg_P_str(vc); + if (v1 == 0UL) goto b1; + zleave(vc, 109UL); return 1UL; b1: zfail(vc); return 0UL; @@ -28941,7 +29763,7 @@ u zpeg_P_lexer_suffix(u vc) { u v1 = 0; u v2 = 0; u v3 = 0; - zenter(vc, 118UL); + zenter(vc, 112UL); v1 = zpeg_P_lexer_primary(vc); if (v1 == 0UL) goto b1; b5: zchoice(vc); @@ -28951,7 +29773,7 @@ b5: zchoice(vc); if (v3 == 0UL) goto b6; zcommit(vc); goto b5; -b6: zleave(vc, 118UL); +b6: zleave(vc, 112UL); return 1UL; b1: zfail(vc); return 0UL; @@ -31326,6 +32148,10 @@ u zreconstruct(u vc, u vpn) { u v47 = 0; u v48 = 0; u v49 = 0; + u v50 = 0; + u v51 = 0; + u v52 = 0; + u v53 = 0; v6 = (u)zassert; v7 = (u)(*(u*)(vpn + 0UL) == 0UL); v8 = (u)"grammar"; @@ -31344,17 +32170,17 @@ b6: if (*(u*)(vpn + 0UL) != 2UL) goto b12; v13 = vpn; v14 = ((u(*)())v11)(v12, v13); vn = v14; -b10: v41 = (u)zmknode1; - v42 = vc; - v43 = 14UL; - v44 = vn; - v45 = ((u(*)())v41)(v42, v43, v44); - vp = v45; - v46 = (u)zcopypos; - v47 = vp; - v48 = vpn; - v49 = ((u(*)())v46)(v47, v48); - v49; +b10: v45 = (u)zmknode1; + v46 = vc; + v47 = 14UL; + v48 = vn; + v49 = ((u(*)())v45)(v46, v47, v48); + vp = v49; + v50 = (u)zcopypos; + v51 = vp; + v52 = vpn; + v53 = ((u(*)())v50)(v51, v52); + v53; *(u*)vlink = vp; vlink = vp + 16UL; vpn = *(u*)(vpn + 8UL); @@ -31384,29 +32210,33 @@ b21: if (*(u*)(vpn + 0UL) != 93UL) goto b24; v27 = (u)zpeg_compile; v28 = *(u*)(vc + 16UL); v29 = vpn; - v30 = *(u*)(vc + 24UL); + v30 = *(u*)(vc + 32UL); v31 = ((u(*)())v27)(v28, v29, v30); v31; vpn = *(u*)(vpn + 8UL); goto b2; -b24: if (*(u*)(vpn + 0UL) != 122UL) goto b27; - v32 = (u)zdie; - v33 = (u)"lexer"; - v34 = ((u(*)())v32)(v33); - v34; +b24: if (*(u*)(vpn + 0UL) != 116UL) goto b27; + v32 = (u)zlexer_compile; + v33 = *(u*)(vc + 24UL); + v34 = vpn; + v35 = *(u*)(vc + 32UL); + v36 = ((u(*)())v32)(v33, v34, v35); + v36; vpn = *(u*)(vpn + 8UL); goto b2; -b27: if (*(u*)(vpn + 0UL) != 113UL) goto b30; - v35 = (u)zdie; - v36 = (u)"lalr"; - v37 = ((u(*)())v35)(v36); - v37; +b27: if (*(u*)(vpn + 0UL) != 123UL) goto b30; + v37 = (u)zlalr_compiler; + v38 = *(u*)(vc + 24UL); + v39 = vpn; + v40 = *(u*)(vc + 32UL); + v41 = ((u(*)())v37)(v38, v39, v40); + v41; vpn = *(u*)(vpn + 8UL); goto b2; -b30: v38 = (u)zdie; - v39 = (u)"invalid decl"; - v40 = ((u(*)())v38)(v39); - v40; +b30: v42 = (u)zdie; + v43 = (u)"invalid decl"; + v44 = ((u(*)())v42)(v43); + v44; goto b10; b8: v10 = 1UL; goto b9; @@ -34311,7 +35141,7 @@ u zsetup_parser(u vcc, u verr) { u v17 = 0; v3 = (u)zalloc; v4 = *(u*)(vcc + 0UL); - v5 = 32UL; + v5 = 40UL; v6 = ((u(*)())v3)(v4, v5); vc = v6; *(u*)(vc + 0UL) = *(u*)(vcc + 0UL); @@ -34328,7 +35158,8 @@ u zsetup_parser(u vcc, u verr) { v16 = vcc; v17 = ((u(*)())v15)(v16); *(u*)(vc + 16UL) = v17; - *(u*)(vc + 24UL) = verr; + *(u*)(vc + 32UL) = verr; + *(u*)(vc + 24UL) = vcc; return vc; } u zsetup_peg(u vcc) { @@ -40595,41 +41426,49 @@ b18: if (vch != 39UL) goto b20; return 39UL; b20: if (vch != 34UL) goto b22; return 34UL; -b22: if (vch != 120UL) goto b24; - if ((s)*(u*)vi < (s)vlen) goto b27; +b22: if (vch != 91UL) goto b24; + return 91UL; +b24: if (vch != 93UL) goto b26; + return 93UL; +b26: if (vch != 45UL) goto b28; + return 45UL; +b28: if (vch != 94UL) goto b30; + return 94UL; +b30: if (vch != 120UL) goto b32; + if ((s)*(u*)vi < (s)vlen) goto b35; *(u*)vok = 0UL; return 0UL; -b27: vch = (u)*(b*)(vs + *(u*)vi * 1UL); +b35: vch = (u)*(b*)(vs + *(u*)vi * 1UL); *(u*)vi = *(u*)vi + 1UL; v6 = (u)zhexdig; v7 = vch; v8 = vok; v9 = ((u(*)())v6)(v7, v8); vhex = v9 * 16UL; - if (!*(u*)vok) goto b33; + if (!*(u*)vok) goto b41; v10 = 0UL; -b34: if (!v10) goto b31; +b42: if (!v10) goto b39; return 0UL; -b31: if ((s)*(u*)vi < (s)vlen) goto b37; +b39: if ((s)*(u*)vi < (s)vlen) goto b45; *(u*)vok = 0UL; return 0UL; -b37: vch = (u)*(b*)(vs + *(u*)vi * 1UL); +b45: vch = (u)*(b*)(vs + *(u*)vi * 1UL); *(u*)vi = *(u*)vi + 1UL; v11 = (u)zhexdig; v12 = vch; v13 = vok; v14 = ((u(*)())v11)(v12, v13); vhex = vhex | v14; - if (!*(u*)vok) goto b43; + if (!*(u*)vok) goto b51; v15 = 0UL; -b44: if (!v15) goto b41; +b52: if (!v15) goto b49; return 0UL; -b41: return vhex; -b43: v15 = 1UL; - goto b44; -b33: v10 = 1UL; - goto b34; -b24: *(u*)vok = 0UL; +b49: return vhex; +b51: v15 = 1UL; + goto b52; +b41: v10 = 1UL; + goto b42; +b32: *(u*)vok = 0UL; return 0UL; } u zunify(u vc, u va, u vb) { diff --git a/cc3.om b/cc3.om @@ -1,5 +1,5 @@ peg_grammar { - grammar = sp (enum_decl / struct_decl / union_decl / func_decl / peg_grammar / lalr_grammar / lexer_grammar)* !.; + grammar = sp (enum_decl / struct_decl / union_decl / func_decl / peg_grammar / lexer_grammar / lalr_grammar)* !.; enum_item = ident sp ("=" sp expr)?; enum_decl = enum sp "{" sp (enum_item ("," sp enum_item)*)? ("," sp)? "}" sp; @@ -149,21 +149,22 @@ peg_grammar { peg_call = peg_identifier !(sp "="); peg_identifier = [[a-zA-Z0-9_]]+; - lalr_primary = "(" sp lalr_pattern ")" sp / ident sp; - lalr_op = "*" / "+" / "?"; - lalr_suffix = lalr_primary (lalr_op sp)*; - lalr_alternative = lalr_suffix*; - lalr_pattern = lalr_alternative ("|" sp lalr_alternative)*; - lalr_rule = ident sp "=" sp lalr_pattern ";" sp; - lalr_grammar = "lalr" sp "{" sp lalr_rule+ "}" sp; - lexer_dot = "."; lexer_op = "*" / "+" / "?"; + lexer_str = str; lexer_charset = "[[" ( !"]" !"\\" . / "\\" . )* "]]"; - lexer_primary = "(" sp lexer_pattern ")" sp / lexer_dot sp / str sp / lexer_charset sp; + lexer_primary = "(" sp lexer_pattern ")" sp / lexer_dot sp / lexer_str sp / lexer_charset sp; lexer_suffix = lexer_primary (lexer_op sp)*; lexer_alternative = lexer_suffix*; lexer_pattern = lexer_alternative ("|" sp lexer_alternative)*; lexer_rule = ident sp "=" sp lexer_pattern ";" sp; lexer_grammar = "lexer" sp "{" sp lexer_rule+ "}" sp; + + lalr_primary = "(" sp lalr_pattern ")" sp / ident sp; + lalr_op = "*" / "+" / "?"; + lalr_suffix = lalr_primary (lalr_op sp)*; + lalr_alternative = lalr_suffix*; + lalr_pattern = lalr_alternative ("|" sp lalr_alternative)*; + lalr_rule = ident sp "=" sp lalr_pattern ";" sp; + lalr_grammar = "lalr" sp "{" sp lalr_rule+ "}" sp; } diff --git a/lexer.om b/lexer.om @@ -1,3 +1,415 @@ +// https://swtch.com/~rsc/regexp/regexp1.html + +struct nfa { + tag: int; + ch: int; + out: *nfa; + alt: *nfa; + tail: **nfa; + ntail: int; +} + +func nfa_concat(c: *compiler, a: *nfa, b:* nfa): *nfa { + var i: int; + + if !a { + return nil; + } + + if !b { + return nil; + } + + i = 0; + loop { + if i == a.ntail { + break; + } + + a.tail[i].out = b; + + i = i + 1; + } + + free(c.a, a.tail as *byte); + + a.tail = b.tail; + a.ntail = b.ntail; + + return a; +} + +func nfa_alt(c: *compiler, a: *nfa, b: *nfa): *nfa { + var ret: *nfa; + var n: *nfa; + var tail: **nfa; + var i: int; + var j: int; + + if !a { + return b; + } + + if !b { + return a; + } + + n = alloc(c.a, sizeof(*n)) as *nfa; + n.ch = -1; + n.out = a; + n.alt = b; + n.ntail = a.ntail + b.ntail; + n.tail = alloc(c.a, sizeof(*tail) * n.ntail) as **nfa; + + i = 0; + j = 0; + loop { + if i == a.ntail { + break; + } + + n.tail[j] = a.tail[i]; + + j = j + 1; + i = i + 1; + } + + loop { + if i == b.ntail { + break; + } + + n.tail[j] = b.tail[i]; + + j = j + 1; + i = i + 1; + } + + free(c.a, a.tail as *byte); + free(c.a, b.tail as *byte); + + return ret; +} + +func nfa_plus(c: *compiler, a: *nfa): *nfa { + var b: *nfa; + if !a { + return nil; + } + b = alloc(c.a, sizeof(*b)) as *nfa; + b.ch = -1; + b.out = nil; + b.alt = a; + b.tail = alloc(c.a, sizeof(*b.tail)) as **nfa; + b.ntail = 1; + b.tail[0] = b; + return nfa_concat(c, a, b); +} + +func nfa_qmark(c: *compiler, a: *nfa): *nfa { + var b: *nfa; + var i: int; + if !a { + return nfa_empty(c); + } + b = alloc(c.a, sizeof(*b)) as *nfa; + b.ch = -1; + b.out = nil; + b.alt = a; + b.ntail = a.ntail + 1; + b.tail = alloc(c.a, sizeof(*b.tail) * b.ntail) as **nfa; + loop { + if i == a.ntail { + break; + } + b.tail[i] = a.tail[i]; + i = i + 1; + } + b.tail[i] = b; + free(c.a, a.tail as *byte); + return b; +} + +func nfa_star(c: *compiler, a: *nfa): *nfa { + a = nfa_plus(c, a); + return nfa_qmark(c, a); +} + +func nfa_lit(c: *compiler, ch: int): *nfa { + var n: *nfa; + n = alloc(c.a, sizeof(*n)) as *nfa; + n.out = nil; + n.ch = ch; + n.tail = alloc(c.a, sizeof(*n.tail)) as **nfa; + n.ntail = 1; + n.tail[0] = n; + return n; +} + +func nfa_empty(c: *compiler): *nfa { + var n: *nfa; + n = alloc(c.a, sizeof(*n)) as *nfa; + n.ch = -1; + n.out = nil; + n.alt = nil; + n.tail = alloc(c.a, sizeof(*n.tail)) as **nfa; + n.ntail = 1; + n.tail[0] = n; + return n; +} + +func nfa_any(c: *compiler): *nfa { + return nfa_lit(c, -1); +} + func lexer_compile(c: *compiler, pn: *peg_node, err: *file) { - die("lexer"); + var a: *nfa; + var b: *nfa; + + pn = pn.child; + loop { + if !pn { + break; + } + + b = lexer_compile_rule(c, pn); + a = nfa_alt(c, a, b); + + pn = pn.next; + } +} + +func lexer_compile_rule(c: *compiler, pn: *peg_node): *nfa { + var ident: *peg_node; + var pat: *peg_node; + var a: *nfa; + var b: *nfa; + + ident = pn.child; + pat = ident.next; + + a = lexer_compile_pattern(c, pat); + b = nfa_empty(c); + b.tag = 1; + + return nfa_concat(c, a, b); +} + +func lexer_compile_pattern(c: *compiler, pn: *peg_node): *nfa { + var alt: *peg_node; + var a: *nfa; + var b: *nfa; + + alt = pn.child; + a = nil; + loop { + if !alt { + break; + } + + b = lexer_compile_alternative(c, alt); + + a = nfa_alt(c, a, b); + + alt = alt.next; + } + + return a; +} + +func lexer_compile_alternative(c: *compiler, pn: *peg_node): *nfa { + var part: *peg_node; + var a: *nfa; + var b: *nfa; + + part = pn.child; + a = nfa_empty(c); + loop { + if !part { + break; + } + + b = lexer_compile_suffix(c, part); + + a = nfa_concat(c, a, b); + + part = part.next; + } + + return a; +} + +func lexer_compile_suffix(c: *compiler, pn: *peg_node): *nfa { + var primary: *peg_node; + var op: *peg_node; + var n: *nfa; + var ch: int; + var zero: int; + var more: int; + + primary = pn.child; + + n = lexer_compile_primary(c, primary); + + zero = 0; + more = 0; + + op = primary.next; + loop { + if !op { + break; + } + + ch = op.str[0] as int; + + if ch == '*' { + zero = 1; + more = 1; + } else if ch == '+' { + more = 1; + } else if ch == '?' { + zero = 1; + } + + op = op.next; + } + + if zero && more { + return nfa_star(c, n); + } else if zero { + return nfa_qmark(c, n); + } else if more { + return nfa_plus(c, n); + } else { + return n; + } +} + +func lexer_compile_primary(c: *compiler, pn: *peg_node): *nfa { + var tag: int; + + pn = pn.child; + tag = pn.tag; + + if tag == P_lexer_pattern { + return lexer_compile_pattern(c, pn); + } else if tag == P_lexer_dot { + return lexer_compile_any(c, pn); + } else if tag == P_lexer_str { + return lexer_compile_str(c, pn); + } else if tag == P_lexer_charset { + return lexer_compile_charset(c, pn); + } else { + die("invalid lexer_primary"); + return nil; + } +} + +func lexer_compile_any(c: *compiler, pn: *peg_node): *nfa { + return nfa_any(c); +} + +func lexer_compile_str(c: *compiler, pn: *peg_node): *nfa { + var i: int; + var len: int; + var ok: int; + var ch: int; + var a: *nfa; + var b: *nfa; + + i = 1; + len = pn.len - 1; + a = nfa_empty(c); + loop { + if i == len { + break; + } + ch = unescape(pn.str, &i, len, &ok); + if !ok { + die("lexer_compile_str: invalid escape"); + } + b = nfa_lit(c, ch); + a = nfa_concat(c, a, b); + } + return a; +} + +func lexer_compile_charset(c: *compiler, pn: *peg_node): *nfa { + var i: int; + var j: int; + var len: int; + var ch: int; + var x: int; + var ok: int; + var s: *byte; + var scratch: *byte; + var start: int; + var end: int; + var a: *nfa; + var b: *nfa; + + scratch = alloc(c.a, 256); + + s = pn.str; + i = 2; + len = pn.len - 2; + x = 1; + loop { + if i == len { + break; + } + + ch = s[i] as int; + if ch == '\\' { + ch = unescape(s, &i, len, &ok); + if !ok { + die("lexer_compile_charset: invalid escape"); + } + } else if ch == '^' { + x = 0; + i = i + 1; + continue; + } else { + i = i + 1; + } + + start = ch; + end = ch; + + if i != len && (s[i] as int) == '-' { + i = i + 1; + ch = unescape(s, &i, len, &ok); + if !ok { + die("lexer_compile_charset: invalid escape"); + } + end = ch; + } + + j = start; + loop { + if j > end { + break; + } + + scratch[j] = x as byte; + + j = j + 1; + } + } + + j = 0; + loop { + if j == 256 { + break; + } + if scratch[j] { + b = nfa_lit(c, j); + a = nfa_alt(c, a, b); + } + j = j + 1; + } + + free(c.a, scratch); + + return a; } diff --git a/lib.om b/lib.om @@ -463,6 +463,14 @@ func unescape(s: *byte, i: *int, len: int, ok: *int): int { return '\''; } else if ch == '\"' { return '\"'; + } else if ch == '[' { + return '['; + } else if ch == ']' { + return ']'; + } else if ch == '-' { + return '-'; + } else if ch == '^' { + return '^'; } else if ch == 'x' { if *i >= len { *ok = 0;