lexer.om (23007B)
1 // https://swtch.com/~rsc/regexp/regexp1.html 2 3 struct nfa { 4 id: int; 5 tag: int; 6 start: int; 7 end: int; 8 out: *nfa; 9 alt: *nfa; 10 tail: **nfa; 11 ntail: int; 12 mark: int; 13 } 14 15 func nfa_concat(c: *compiler, a: *nfa, b:* nfa): *nfa { 16 var i: int; 17 18 if !a { 19 return nil; 20 } 21 22 if !b { 23 return nil; 24 } 25 26 i = 0; 27 loop { 28 if i == a.ntail { 29 break; 30 } 31 32 a.tail[i].out = b; 33 34 i = i + 1; 35 } 36 37 free(c.a, a.tail as *byte); 38 39 a.tail = b.tail; 40 a.ntail = b.ntail; 41 42 return a; 43 } 44 45 func nfa_alt(c: *compiler, a: *nfa, b: *nfa): *nfa { 46 var n: *nfa; 47 var tail: **nfa; 48 var i: int; 49 var j: int; 50 51 if !a { 52 return b; 53 } 54 55 if !b { 56 return a; 57 } 58 59 n = alloc(c.a, sizeof(*n)) as *nfa; 60 n.id = c.nfa_id; 61 c.nfa_id = c.nfa_id + 1; 62 n.start = -1; 63 n.end = -1; 64 n.out = a; 65 n.alt = b; 66 n.ntail = a.ntail + b.ntail; 67 n.tail = alloc(c.a, sizeof(*tail) * n.ntail) as **nfa; 68 69 i = 0; 70 j = 0; 71 loop { 72 if i == a.ntail { 73 break; 74 } 75 76 n.tail[j] = a.tail[i]; 77 78 j = j + 1; 79 i = i + 1; 80 } 81 82 i = 0; 83 loop { 84 if i == b.ntail { 85 break; 86 } 87 88 n.tail[j] = b.tail[i]; 89 90 j = j + 1; 91 i = i + 1; 92 } 93 94 free(c.a, a.tail as *byte); 95 free(c.a, b.tail as *byte); 96 97 return n; 98 } 99 100 func nfa_plus(c: *compiler, a: *nfa): *nfa { 101 var b: *nfa; 102 if !a { 103 return nil; 104 } 105 b = alloc(c.a, sizeof(*b)) as *nfa; 106 b.id = c.nfa_id; 107 c.nfa_id = c.nfa_id + 1; 108 b.start = -1; 109 b.end = -1; 110 b.out = nil; 111 b.alt = a; 112 b.tail = alloc(c.a, sizeof(*b.tail)) as **nfa; 113 b.ntail = 1; 114 b.tail[0] = b; 115 return nfa_concat(c, a, b); 116 } 117 118 func nfa_qmark(c: *compiler, a: *nfa): *nfa { 119 var b: *nfa; 120 var i: int; 121 if !a { 122 return nfa_empty(c); 123 } 124 b = alloc(c.a, sizeof(*b)) as *nfa; 125 b.id = c.nfa_id; 126 c.nfa_id = c.nfa_id + 1; 127 b.start = -1; 128 b.end = -1; 129 b.out = nil; 130 b.alt = a; 131 b.ntail = a.ntail + 1; 132 b.tail = alloc(c.a, sizeof(*b.tail) * b.ntail) as **nfa; 133 loop { 134 if i == a.ntail { 135 break; 136 } 137 b.tail[i] = a.tail[i]; 138 i = i + 1; 139 } 140 b.tail[i] = b; 141 free(c.a, a.tail as *byte); 142 return b; 143 } 144 145 func nfa_star(c: *compiler, a: *nfa): *nfa { 146 a = nfa_plus(c, a); 147 return nfa_qmark(c, a); 148 } 149 150 func nfa_lit(c: *compiler, start: int, end: int): *nfa { 151 var n: *nfa; 152 n = alloc(c.a, sizeof(*n)) as *nfa; 153 n.id = c.nfa_id; 154 c.nfa_id = c.nfa_id + 1; 155 n.out = nil; 156 n.start = start; 157 n.end = end; 158 n.tail = alloc(c.a, sizeof(*n.tail)) as **nfa; 159 n.ntail = 1; 160 n.tail[0] = n; 161 return n; 162 } 163 164 func nfa_empty(c: *compiler): *nfa { 165 var n: *nfa; 166 n = alloc(c.a, sizeof(*n)) as *nfa; 167 n.id = c.nfa_id; 168 c.nfa_id = c.nfa_id + 1; 169 n.start = -1; 170 n.end = -1; 171 n.out = nil; 172 n.alt = nil; 173 n.tail = alloc(c.a, sizeof(*n.tail)) as **nfa; 174 n.ntail = 1; 175 n.tail[0] = n; 176 return n; 177 } 178 179 func nfa_any(c: *compiler): *nfa { 180 var n: *nfa; 181 n = alloc(c.a, sizeof(*n)) as *nfa; 182 n.id = c.nfa_id; 183 c.nfa_id = c.nfa_id + 1; 184 n.out = nil; 185 n.start = 0; 186 n.end = 255; 187 n.tail = alloc(c.a, sizeof(*n.tail)) as **nfa; 188 n.ntail = 1; 189 n.tail[0] = n; 190 return n; 191 } 192 193 func lexer_compile(c: *compiler, n: *node, err: *file) { 194 var a: *nfa; 195 var b: *nfa; 196 var d: *dfa; 197 198 n = n.a; 199 loop { 200 if !n { 201 break; 202 } 203 204 b = lexer_compile_rule(c, n.a); 205 a = nfa_alt(c, a, b); 206 207 n = n.b; 208 } 209 210 d = lexer_explode(c, a); 211 212 lexer_codegen(c, d); 213 } 214 215 func lexer_compile_rule(c: *compiler, n: *node): *nfa { 216 var pat: *node; 217 var act: *node; 218 var a: *nfa; 219 var b: *nfa; 220 var s: *byte; 221 222 pat = n.a; 223 act = n.b; 224 225 a = lexer_compile_pattern(c, pat); 226 b = nfa_empty(c); 227 b.tag = lexer_compile_action(c, act); 228 229 return nfa_concat(c, a, b); 230 } 231 232 func lexer_compile_action(c: *compiler, n: *node): int { 233 var a: *node; 234 var b: *node; 235 var d: *node; 236 var t: *node; 237 var id: int; 238 var s: *byte; 239 var func_type: *type; 240 var v: *irop; 241 var o: *irop; 242 243 ensure_arr(c.a, (&c.lex_action) as **void, &c.lex_action_cap, c.lex_action_len + 1, sizeof(*c.lex_action)); 244 id = c.lex_action_len; 245 246 s = alloc(c.a, 64); 247 memcpy(s, "lexer_action_", 13); 248 int2str(&s[13], id); 249 250 c.lex_action[id] = s; 251 c.lex_action_len = c.lex_action_len + 1; 252 253 b = mknode(c.p, N_IDENT, nil, nil); 254 b.s = "ctx"; 255 a = mknode(c.p, N_IDENT, nil, nil); 256 a.s = "lex"; 257 a = mknode(c.p, N_PTRTYPE, a, nil); 258 a = mknode(c.p, N_ARGDECL, b, a); 259 a = mknode(c.p, N_ARGLIST, a, nil); 260 b = mknode(c.p, N_IDENT, nil, nil); 261 b.s = "int"; 262 a = mknode(c.p, N_FUNCTYPE, a, b); 263 func_type = prototype(c, a); 264 b = mknode(c.p, N_IDENT, nil, nil); 265 b.s = s; 266 a = mknode(c.p, N_FUNCDECL, b, a); 267 268 b = mknode(c.p, N_NUM, nil, nil); 269 b.n = 0; 270 b = mknode(c.p, N_RETURN, b, nil); 271 t = mknode(c.p, N_STMTLIST, b, t); 272 273 d = mknode(c.p, N_IDENT, nil, nil); 274 d.s = "ctx"; 275 d = mknode(c.p, N_EXPRLIST, d, nil); 276 b = mknode(c.p, N_IDENT, nil, nil); 277 b.s = "lex_skip"; 278 b = mknode(c.p, N_CALL, b, d); 279 t = mknode(c.p, N_STMTLIST, b, t); 280 281 t = mknode(c.p, N_STMTLIST, n, t); 282 283 a = mknode(c.p, N_FUNC, a, t); 284 285 defun(c, a); 286 287 return id + 1; 288 } 289 290 func lexer_compile_pattern(c: *compiler, n: *node): *nfa { 291 var alt: *node; 292 var a: *nfa; 293 var b: *nfa; 294 var kind: int; 295 296 kind = n.kind; 297 if kind == N_STR { 298 return lexer_compile_str(c, n); 299 } else if kind == N_LEXDOT { 300 return nfa_any(c); 301 } else if kind == N_CHARSET { 302 return lexer_compile_charset(c, n); 303 } else if kind == N_LEXSTAR { 304 a = lexer_compile_pattern(c, n.a); 305 return nfa_star(c, a); 306 } else if kind == N_LEXPLUS { 307 a = lexer_compile_pattern(c, n.a); 308 return nfa_plus(c, a); 309 } else if kind == N_LEXQMARK { 310 a = lexer_compile_pattern(c, n.a); 311 return nfa_qmark(c, a); 312 } else if kind == N_LEXCONCAT { 313 a = lexer_compile_pattern(c, n.a); 314 b = lexer_compile_pattern(c, n.b); 315 return nfa_concat(c, a, b); 316 } else if kind == N_LEXALT { 317 a = lexer_compile_pattern(c, n.a); 318 b = lexer_compile_pattern(c, n.b); 319 return nfa_alt(c, a, b); 320 } else { 321 die("unknown lex node"); 322 return nil; 323 } 324 } 325 326 func lexer_compile_str(c: *compiler, n: *node): *nfa { 327 var i: int; 328 var len: int; 329 var ok: int; 330 var ch: int; 331 var a: *nfa; 332 var b: *nfa; 333 334 i = 0; 335 a = nfa_empty(c); 336 loop { 337 ch = n.s[i] as int; 338 if ch == 0 { 339 break; 340 } 341 b = nfa_lit(c, ch, ch); 342 a = nfa_concat(c, a, b); 343 i = i + 1; 344 } 345 return a; 346 } 347 348 func parse_charset(a: *alloc, s: *byte, len: int): *byte { 349 var i: int; 350 var j: int; 351 var ch: int; 352 var x: int; 353 var ok: int; 354 var scratch: *byte; 355 var start: int; 356 var end: int; 357 358 scratch = alloc(a, 256); 359 360 i = 0; 361 x = 1; 362 363 if s[i] == '^' as byte { 364 x = 0; 365 j = 0; 366 loop { 367 if j == 256 { 368 break; 369 } 370 scratch[j] = 1 as byte; 371 j = j + 1; 372 } 373 i = i + 1; 374 } 375 376 loop { 377 if i == len { 378 break; 379 } 380 381 ch = s[i] as int; 382 if ch == '\\' { 383 ch = unescape(s, &i, len, &ok); 384 if !ok { 385 die("parse_charset: invalid escape"); 386 } 387 } else if ch == '^' { 388 x = 0; 389 i = i + 1; 390 continue; 391 } else { 392 i = i + 1; 393 } 394 395 start = ch; 396 end = ch; 397 398 if i != len && (s[i] as int) == '-' { 399 i = i + 1; 400 ch = unescape(s, &i, len, &ok); 401 if !ok { 402 die("parse_charset: invalid escape"); 403 } 404 end = ch; 405 } 406 407 j = start; 408 loop { 409 if j > end { 410 break; 411 } 412 413 scratch[j] = x as byte; 414 415 j = j + 1; 416 } 417 } 418 419 return scratch; 420 } 421 422 func lexer_compile_charset(c: *compiler, n: *node): *nfa { 423 var scratch: *byte; 424 var j: int; 425 var start: int; 426 var end: int; 427 var a: *nfa; 428 var b: *nfa; 429 430 scratch = n.s; 431 432 j = 0; 433 loop { 434 if j == 256 { 435 break; 436 } 437 438 if !scratch[j] { 439 j = j + 1; 440 continue; 441 } 442 443 start = j; 444 445 loop { 446 if j == 256 || !scratch[j] { 447 break; 448 } 449 j = j + 1; 450 } 451 452 end = j - 1; 453 454 b = nfa_lit(c, start, end); 455 a = nfa_alt(c, a, b); 456 } 457 458 return a; 459 } 460 461 func nfa_show(n: *nfa) { 462 fputs(nil, "digraph {\n"); 463 nfa_show2(n); 464 fputs(nil, "}\n"); 465 } 466 467 func nfa_show2(n: *nfa) { 468 if !n { 469 return; 470 } 471 472 if n.mark { 473 return; 474 } 475 476 n.mark = 1; 477 478 fputd(nil, n.id); 479 if n.tag != 0 { 480 fputs(nil, "[shape=doublecircle]"); 481 } 482 fputs(nil, ";\n"); 483 484 if n.out { 485 fputd(nil, n.id); 486 fputs(nil, "->"); 487 fputd(nil, n.out.id); 488 if n.out.start != -1 { 489 fputs(nil, "[label=\"'"); 490 if n.out.start == '"' || n.out.start == '\\' { 491 fputs(nil, "\\"); 492 } 493 if n.out.start != 0 { 494 fputc(nil, n.out.start); 495 } else { 496 fputs(nil, "\\0"); 497 } 498 fputs(nil, "'\"]"); 499 } 500 fputs(nil, ";\n"); 501 nfa_show2(n.out); 502 } 503 504 if n.alt { 505 fputd(nil, n.id); 506 fputs(nil, "->"); 507 fputd(nil, n.alt.id); 508 if n.alt.start != -1 { 509 fputs(nil, "[label=\"'"); 510 if n.alt.start == '"' || n.alt.start == '\\' { 511 fputs(nil, "\\"); 512 } 513 if n.alt.start != 0 { 514 fputc(nil, n.alt.start); 515 } else { 516 fputs(nil, "\\0"); 517 } 518 fputs(nil, "'\"]"); 519 } 520 fputs(nil, ";\n"); 521 nfa_show2(n.alt); 522 } 523 } 524 525 struct dfa { 526 node: rbnode; 527 id: int; 528 link: **dfa; 529 key: **nfa; 530 keylen: int; 531 mark: int; 532 tag: int; 533 } 534 535 struct dfakey { 536 a: *alloc; 537 live: **nfa; 538 len: int; 539 cap: int; 540 tag: int; 541 } 542 543 func dfakey_add(k: *dfakey, n: *nfa) { 544 var tmp: **nfa; 545 546 if !n || n.mark { 547 return; 548 } 549 n.mark = 1; 550 551 if !k.tag || (n.tag && n.tag < k.tag) { 552 k.tag = n.tag; 553 } 554 555 if k.len == k.cap { 556 if k.cap == 0 { 557 k.cap = 16; 558 } else { 559 k.cap = k.cap * 2; 560 } 561 tmp = alloc(k.a, sizeof(*k.live) * k.cap) as **nfa; 562 memcpy(tmp as *byte, k.live as *byte, sizeof(*k.live) * k.len); 563 free(k.a, k.live as *byte); 564 k.live = tmp; 565 } 566 567 k.live[k.len] = n; 568 k.len = k.len + 1; 569 570 if n.out && n.out.start == -1 { 571 dfakey_add(k, n.out); 572 } 573 if n.alt && n.alt.start == -1 { 574 dfakey_add(k, n.alt); 575 } 576 } 577 578 func dfakey_clear(k: *dfakey) { 579 var i: int; 580 581 i = 0; 582 loop { 583 if i == k.len { 584 break; 585 } 586 587 k.live[i].mark = 0; 588 589 i = i + 1; 590 } 591 592 k.tag = 0; 593 k.len = 0; 594 } 595 596 func dfakey_sort(k: *dfakey) { 597 var i: int; 598 var j: int; 599 var tmp: *nfa; 600 601 i = 1; 602 loop { 603 if i >= k.len { 604 break; 605 } 606 j = i; 607 tmp = k.live[j]; 608 loop { 609 if j == 0 || k.live[j - 1].id <= tmp.id { 610 break; 611 } 612 k.live[j] = k.live[j - 1]; 613 j = j - 1; 614 } 615 k.live[j] = tmp; 616 i = i + 1; 617 } 618 } 619 620 func lexer_explode(c: *compiler, n: *nfa): *dfa { 621 var _k: dfakey; 622 var k: *dfakey; 623 var ret: *dfa; 624 625 k = &_k; 626 k.a = c.a; 627 628 if n.start != -1 { 629 die("expected epsilon"); 630 } 631 632 dfakey_add(k, n); 633 634 ret = nfa2dfa(c, k); 635 636 free(k.a, k.live as *byte); 637 638 ret = dfa_minimize(c, ret); 639 640 return ret; 641 } 642 643 func nfa2dfa(c: *compiler, k: *dfakey): *dfa { 644 var d: *dfa; 645 var p: *rbnode; 646 var link: **rbnode; 647 var dir: int; 648 var i: int; 649 var j: int; 650 651 if k.len == 0 { 652 return nil; 653 } 654 655 dfakey_sort(k); 656 657 p = nil; 658 link = &c.dfa_cache; 659 loop { 660 d = (*link) as *dfa; 661 if !d { 662 break; 663 } 664 665 dir = dfa_cmp(k, d); 666 667 if dir < 0 { 668 link = &d.node.left; 669 p = &d.node; 670 } else if dir > 0 { 671 link = &d.node.right; 672 p = &d.node; 673 } else { 674 return d; 675 } 676 } 677 678 d = alloc(c.a, sizeof(*d)) as *dfa; 679 d.link = alloc(c.a, sizeof(*d.link) * 256) as **dfa; 680 d.key = alloc(c.a, sizeof(*d.key) * k.len) as **nfa; 681 d.keylen = k.len; 682 d.id = c.dfa_id; 683 c.dfa_id = c.dfa_id + 1; 684 685 i = 0; 686 loop { 687 if i == k.len { 688 break; 689 } 690 d.key[i] = k.live[i]; 691 i = i + 1; 692 } 693 d.tag = k.tag; 694 695 rb_link((&c.dfa_cache) as **rbnode, link, p, &d.node); 696 697 i = 0; 698 loop { 699 if i == 256 { 700 break; 701 } 702 703 dfakey_clear(k); 704 j = 0; 705 loop { 706 if j == d.keylen { 707 break; 708 } 709 710 if d.key[j].out && d.key[j].out.start <= i && d.key[j].out.end >= i { 711 dfakey_add(k, d.key[j].out); 712 } 713 if d.key[j].alt && d.key[j].alt.start <= i && d.key[j].alt.end >= i { 714 dfakey_add(k, d.key[j].alt); 715 } 716 717 j = j + 1; 718 } 719 720 d.link[i] = nfa2dfa(c, k); 721 i = i + 1; 722 } 723 724 return d; 725 } 726 727 func dfa_cmp(k: *dfakey, d: *dfa): int { 728 var i: int; 729 730 i = 0; 731 loop { 732 if i == k.len && i == d.keylen { 733 break; 734 } 735 736 if i == k.len { 737 return -1; 738 } 739 740 if i == d.keylen { 741 return 1; 742 } 743 744 if k.live[i].id > d.key[i].id { 745 return 1; 746 } 747 748 if k.live[i].id < d.key[i].id { 749 return -1; 750 } 751 752 i = i + 1; 753 } 754 755 return 0; 756 } 757 758 func dfa_show(d: *dfa) { 759 fputs(nil, "digraph {\n"); 760 dfa_show2(d); 761 fputs(nil, "}\n"); 762 } 763 764 func dfa_show2(d: *dfa) { 765 var i: int; 766 var start: int; 767 var end: int; 768 var o: *dfa; 769 if !d || d.mark { 770 return; 771 } 772 d.mark = 1; 773 774 fputd(nil, d.id); 775 if d.tag != 0 { 776 fputs(nil, "[label=\""); 777 fputd(nil, d.tag); 778 fputs(nil, "\" shape=doublecircle]"); 779 } else { 780 fputs(nil, "[label=\"\"]"); 781 } 782 fputs(nil, ";\n"); 783 784 i = 0; 785 loop { 786 if i == 256 { 787 break; 788 } 789 790 if !d.link[i] { 791 i = i + 1; 792 continue; 793 } 794 795 start = i; 796 o = d.link[i]; 797 798 loop { 799 if i == 256 || d.link[i] != o { 800 break; 801 } 802 i = i + 1; 803 } 804 805 end = i - 1; 806 807 fputd(nil, d.id); 808 fputs(nil, "->"); 809 fputd(nil, o.id); 810 fputs(nil, "[label=\""); 811 if start >= ' ' && start <= '~' && start != '"' && start != '\\' { 812 fputc(nil, start); 813 } else { 814 fputs(nil, "\\\\x"); 815 fputh(nil, start >> 4); 816 fputh(nil, start & 15); 817 } 818 if start != end { 819 fputs(nil, "-"); 820 if end >= ' ' && end <= '~' && end != '"' && end != '\\' { 821 fputc(nil, end); 822 } else { 823 fputs(nil, "\\\\x"); 824 fputh(nil, end >> 4); 825 fputh(nil, end & 15); 826 } 827 } 828 fputs(nil, "\"];\n"); 829 830 dfa_show2(o); 831 } 832 } 833 834 struct dfamin { 835 a: *alloc; 836 sets: **dfaset; 837 nsets: int; 838 capsets: int; 839 queue: *dfaset; 840 } 841 842 struct dfaset { 843 next: *dfaset; 844 inqueue: int; 845 ind: **dfa; 846 ind_len: int; 847 ind_cap: int; 848 node: *dfa; 849 } 850 851 func dfa_minimize(c: *compiler, d: *dfa): *dfa { 852 var _ctx: dfamin; 853 var ctx: *dfamin; 854 ctx = &_ctx; 855 ctx.a = c.a; 856 dfamin_setup(ctx, d); 857 dfamin_refine(ctx); 858 return dfamin_extract(ctx, d); 859 } 860 861 func dfamin_setup(ctx: *dfamin, d: *dfa) { 862 var i: int; 863 var s: *dfaset; 864 865 if !d || d.mark { 866 return; 867 } 868 d.mark = d.tag + 1; 869 870 // Split states classes based on the accepting tag. 871 ensure_arr(ctx.a, (&ctx.sets) as **void, &ctx.capsets, d.tag + 1, sizeof(*ctx.sets)); 872 s = ctx.sets[d.tag]; 873 if !s { 874 s = alloc(ctx.a, sizeof(*s)) as *dfaset; 875 s.next = ctx.queue; 876 ctx.queue = s; 877 s.inqueue = 1; 878 ctx.sets[d.tag] = s; 879 if d.tag >= ctx.nsets { 880 ctx.nsets = d.tag + 1; 881 } 882 } 883 884 ensure_arr(ctx.a, (&s.ind) as **void, &s.ind_cap, s.ind_len + 1, sizeof(*s.ind)); 885 s.ind[s.ind_len] = d; 886 s.ind_len = s.ind_len + 1; 887 888 i = 0; 889 loop { 890 if i == 256 { 891 break; 892 } 893 dfamin_setup(ctx, d.link[i]); 894 i = i + 1; 895 } 896 } 897 898 func dfamin_refine(ctx: *dfamin) { 899 var s: *dfaset; 900 var i: int; 901 var p: *dfaset; 902 var ch: int; 903 loop { 904 // Remove a set s from the queue 905 s = ctx.queue; 906 if !s { 907 break; 908 } 909 ctx.queue = s.next; 910 s.inqueue = 0; 911 912 i = 0; 913 loop { 914 // For each set p in the partition 915 if i == ctx.nsets { 916 break; 917 } 918 p = ctx.sets[i]; 919 920 // For each letter of the alphabet 921 ch = 0; 922 loop { 923 if ch == 256 { 924 break; 925 } 926 dfamin_split(ctx, p, s, ch); 927 ch = ch + 1; 928 } 929 930 i = i + 1; 931 } 932 } 933 } 934 935 func dfamin_split(ctx: *dfamin, p: *dfaset, s: *dfaset, ch: int) { 936 var i: int; 937 var k: int; 938 var m: int; 939 var d: *dfa; 940 var e: *dfa; 941 var new_ind: **dfa; 942 var t: *dfaset; 943 944 // Count k the states in p that lead to s on ch 945 i = 0; 946 k = 0; 947 loop { 948 if i == p.ind_len { 949 break; 950 } 951 d = p.ind[i]; 952 953 e = d.link[ch]; 954 if e && ctx.sets[e.mark - 1] == s { 955 k = k + 1; 956 } 957 958 i = i + 1; 959 } 960 961 if k == 0 || k == p.ind_len { 962 return; 963 } 964 965 new_ind = alloc(ctx.a, sizeof(*new_ind) * (p.ind_len - k)) as **dfa; 966 967 // Split nodes into two sets 968 i = 0; 969 k = 0; 970 m = 0; 971 loop { 972 if i == p.ind_len { 973 break; 974 } 975 d = p.ind[i]; 976 977 e = d.link[ch]; 978 if e && ctx.sets[e.mark - 1] == s { 979 p.ind[k] = d; 980 k = k + 1; 981 } else { 982 new_ind[m] = d; 983 d.mark = ctx.nsets + 1; 984 m = m + 1; 985 } 986 987 i = i + 1; 988 } 989 p.ind_len = k; 990 991 // Create a new set from the nodes that don't lead to p 992 ensure_arr(ctx.a, (&ctx.sets) as **void, &ctx.capsets, ctx.nsets + 1, sizeof(*ctx.sets)); 993 t = alloc(ctx.a, sizeof(*t)) as *dfaset; 994 t.next = ctx.queue; 995 t.ind = new_ind; 996 t.ind_len = m; 997 t.ind_cap = m; 998 ctx.queue = t; 999 t.inqueue = 1; 1000 ctx.sets[ctx.nsets] = t; 1001 ctx.nsets = ctx.nsets + 1; 1002 1003 // Queue up p 1004 if !p.inqueue { 1005 p.next = ctx.queue; 1006 ctx.queue = p; 1007 p.inqueue = 1; 1008 } 1009 } 1010 1011 func dfamin_extract(ctx: *dfamin, d: *dfa): *dfa { 1012 var s: *dfaset; 1013 var g: *dfa; 1014 var ch: int; 1015 1016 if !d { 1017 return nil; 1018 } 1019 1020 s = ctx.sets[d.mark - 1]; 1021 if s.node { 1022 return s.node; 1023 } 1024 1025 g = alloc(ctx.a, sizeof(*g)) as *dfa; 1026 g.id = d.mark - 1; 1027 g.tag = d.tag; 1028 g.link = alloc(ctx.a, sizeof(*g.link) * 256) as **dfa; 1029 1030 s.node = g; 1031 1032 ch = 0; 1033 loop { 1034 if ch == 256 { 1035 break; 1036 } 1037 if d.link[ch] != nil { 1038 g.link[ch] = dfamin_extract(ctx, d.link[ch]); 1039 } 1040 ch = ch + 1; 1041 } 1042 1043 return g; 1044 } 1045 1046 struct dfa_codegen { 1047 a: *alloc; 1048 states: **dfa; 1049 states_len: int; 1050 states_cap: int; 1051 tag: **byte; 1052 tag_len: int; 1053 link: *int; 1054 link_len: int; 1055 } 1056 1057 func lexer_codegen(c: *compiler, d: *dfa) { 1058 var _cg: dfa_codegen; 1059 var cg: *dfa_codegen; 1060 var i: int; 1061 var ch: int; 1062 cg = &_cg; 1063 cg.a = c.a; 1064 1065 lexer_codegen_setup(cg, d); 1066 1067 cg.tag_len = cg.states_len * sizeof(*cg.tag); 1068 cg.tag = alloc(cg.a, cg.tag_len) as **byte; 1069 1070 cg.link_len = cg.states_len * 256 * sizeof(*cg.link); 1071 cg.link = alloc(cg.a, cg.link_len + 1) as *int; 1072 1073 i = 0; 1074 loop { 1075 if i == cg.states_len { 1076 break; 1077 } 1078 1079 d = cg.states[i]; 1080 d.mark = i; 1081 1082 i = i + 1; 1083 } 1084 1085 i = 0; 1086 loop { 1087 if i == cg.states_len { 1088 break; 1089 } 1090 1091 d = cg.states[i]; 1092 if d.tag { 1093 cg.tag[i] = c.lex_action[d.tag - 1]; 1094 } 1095 1096 ch = 0; 1097 loop { 1098 if ch == 256 { 1099 break; 1100 } 1101 1102 if d.link[ch] { 1103 cg.link[i * 256 + ch] = d.link[ch].mark; 1104 } else { 1105 cg.link[i * 256 + ch] = -1; 1106 } 1107 1108 ch = ch + 1; 1109 } 1110 1111 i = i + 1; 1112 } 1113 1114 lexer_output(c, cg); 1115 } 1116 1117 func lexer_codegen_setup(cg: *dfa_codegen, d: *dfa) { 1118 var i: int; 1119 1120 if !d || d.mark { 1121 return; 1122 } 1123 d.mark = 1; 1124 1125 ensure_arr(cg.a, (&cg.states) as **void, &cg.states_cap, cg.states_len + 1, sizeof(*cg.states)); 1126 cg.states[cg.states_len] = d; 1127 cg.states_len = cg.states_len + 1; 1128 1129 i = 0; 1130 loop { 1131 if i == 256 { 1132 break; 1133 } 1134 lexer_codegen_setup(cg, d.link[i]); 1135 i = i + 1; 1136 } 1137 } 1138 1139 func lexer_output(c: *compiler, cg: *dfa_codegen) { 1140 var ic: *irfunc; 1141 var o: *irop; 1142 var ret_type: *type; 1143 var func_type: *type; 1144 1145 ret_type = mktype0(c, TY_INT); 1146 ret_type = mktype1(c, TY_PTR, ret_type); 1147 func_type = mktype2(c, TY_FUNC, ret_type, nil); 1148 1149 ic = mkirfunc(c, "get_link_table"); 1150 o = mkirstr(ic, cg.link as *byte, cg.link_len); 1151 irreturn(ic, o); 1152 define_ir_func(c, ic, func_type); 1153 1154 ret_type = mktype0(c, TY_INT); 1155 func_type = mktype2(c, TY_FUNC, ret_type, nil); 1156 1157 ic = mkirfunc(c, "get_num_states"); 1158 o = mkirconst(ic, cg.states_len); 1159 irreturn(ic, o); 1160 define_ir_func(c, ic, func_type); 1161 1162 lexer_compile_get_tag_table(c, cg); 1163 } 1164 1165 func lexer_compile_get_tag_table(c: *compiler, cg: *dfa_codegen) { 1166 var arg1_type: *type; 1167 var args_type: *type; 1168 var ret_type: *type; 1169 var func_type: *type; 1170 var ic: *irfunc; 1171 var o: *irop; 1172 var a: *irop; 1173 var b: *irop; 1174 var this: *irblock; 1175 var next: *irblock; 1176 var i: int; 1177 var v: *irvar; 1178 1179 arg1_type = mktype0(c, TY_VOID); 1180 arg1_type = mktype1(c, TY_PTR, arg1_type); 1181 args_type = mktype1(c, TY_ARG, arg1_type); 1182 ret_type = mktype0(c, TY_INT); 1183 func_type = mktype2(c, TY_FUNC, ret_type, args_type); 1184 arg1_type = mktype1(c, TY_PTR, func_type); 1185 args_type = mktype1(c, TY_ARG, arg1_type); 1186 ret_type = mktype0(c, TY_VOID); 1187 func_type = mktype2(c, TY_FUNC, ret_type, args_type); 1188 1189 ic = mkirfunc(c, "get_tag_table"); 1190 1191 v = iraddarg(ic, "t", arg1_type); 1192 1193 i = 0; 1194 loop { 1195 if i == cg.states_len { 1196 break; 1197 } 1198 1199 if cg.tag[i] { 1200 a = mkirop(ic, IOP_VAR, nil, nil); 1201 a.n = v.n; 1202 b = mkirconst(ic, sizeof(i) * i); 1203 a = mkirop(ic, IOP_ADD, a, b); 1204 a = mkirop(ic, IOP_LOAD, a, nil); 1205 b = mkirfuncref(ic, cg.tag[i]); 1206 o = mkirop(ic, IOP_STORE, a, b); 1207 o.t = arg1_type.val; 1208 iraddop(ic, o); 1209 } 1210 1211 i = i + 1; 1212 } 1213 1214 o = mkirconst(ic, 0); 1215 irreturn(ic, o); 1216 1217 define_ir_func(c, ic, func_type); 1218 } 1219 1220 func get_num_states(): int; 1221 func get_tag_table(t: *(func(ctx: *void):int)); 1222 func get_link_table(): *int; 1223 1224 struct lex { 1225 a: *alloc; 1226 c: *compiler; 1227 1228 tag: *(func(ctx: *lex):int); 1229 link: *int; 1230 1231 fd: int; 1232 eof: int; 1233 goteof: int; 1234 1235 buf: *byte; 1236 start: int; 1237 end: int; 1238 fill: int; 1239 cap: int; 1240 1241 skip: int; 1242 1243 start_lineno: int; 1244 start_colno: int; 1245 end_lineno: int; 1246 end_colno: int; 1247 1248 _val: lex_sem; 1249 val: *lex_sem; 1250 1251 sem: *lex_sem; 1252 sem_len: int; 1253 1254 stack: *lex_sem; 1255 stack_len: int; 1256 stack_cap: int; 1257 } 1258 1259 struct lex_sem { 1260 n: *node; 1261 } 1262 1263 func setup_lex(c: *compiler): *lex { 1264 var l: *lex; 1265 var i: int; 1266 l = alloc(c.a, sizeof(*l)) as *lex; 1267 l.a = c.a; 1268 l.c = c; 1269 i = get_num_states(); 1270 l.tag = alloc(l.a, sizeof(*l.tag) * i) as *(func(ctx:*lex):int); 1271 get_tag_table(l.tag as *(func(ctx: *void):int)); 1272 l.link = get_link_table(); 1273 l.fd = 0; 1274 l.eof = 1; 1275 l.goteof = 1; 1276 l.buf = nil; 1277 l.start = 0; 1278 l.end = 0; 1279 l.fill = 0; 1280 l.cap = 0; 1281 l.start_lineno = 1; 1282 l.start_colno = 1; 1283 l.end_lineno = 1; 1284 l.end_colno = 1; 1285 l.val = &l._val; 1286 l.sem = nil; 1287 l.sem_len = 0; 1288 l.stack = nil; 1289 l.stack_len = 0; 1290 l.stack_cap = 0; 1291 ensure_arr(l.a, (&l.stack) as **void, &l.stack_cap, l.stack_len + 2, sizeof(*l.stack)); 1292 return l; 1293 } 1294 1295 func open_lex(l: *lex, fd: int) { 1296 l.fd = fd; 1297 l.eof = 0; 1298 l.goteof = 0; 1299 1300 l.start = 0; 1301 l.end = 0; 1302 l.fill = 0; 1303 1304 l.start_lineno = 1; 1305 l.start_colno = 1; 1306 1307 l.end_lineno = 1; 1308 l.end_colno = 1; 1309 } 1310 1311 func lex_skip(l: *lex) { 1312 l.skip = 1; 1313 } 1314 1315 func gettok(l: *lex): int { 1316 var state: int; 1317 var ch: int; 1318 var tag: (func(ctx:*lex):int); 1319 var ptr: int; 1320 var lineno: int; 1321 var colno: int; 1322 var ret: int; 1323 var tmp: *byte; 1324 1325 loop { 1326 loop { 1327 ptr = l.end; 1328 lineno = l.end_lineno; 1329 colno = l.end_colno; 1330 1331 l.start = ptr; 1332 l.start_lineno = lineno; 1333 l.start_colno = colno; 1334 1335 l.c.lineno = lineno; 1336 l.c.colno = colno; 1337 1338 l.end_lineno = lineno; 1339 l.end_colno = colno; 1340 1341 state = 0; 1342 tag = nil; 1343 loop { 1344 if ptr == l.fill { 1345 if l.goteof { 1346 if l.start == l.fill { 1347 l.eof = 1; 1348 return -1; 1349 } 1350 break; 1351 } 1352 1353 if l.fill - l.start >= (l.cap >> 1) { 1354 l.cap = l.cap * 2 + 16 * 1024; 1355 tmp = alloc(l.a, l.cap); 1356 memcpy(tmp, &l.buf[l.start], l.fill - l.start); 1357 free(l.a, l.buf); 1358 l.buf = tmp; 1359 ptr = ptr - l.start; 1360 l.end = l.end - l.start; 1361 l.fill = l.fill - l.start; 1362 l.start = 0; 1363 } else if l.fill >= (l.cap >> 1) { 1364 memcpy(l.buf, &l.buf[l.start], l.fill - l.start); 1365 ptr = ptr - l.start; 1366 l.end = l.end - l.start; 1367 l.fill = l.fill - l.start; 1368 l.start = 0; 1369 } 1370 1371 ret = read(l.fd, &l.buf[l.fill], l.cap - l.fill); 1372 if ret < 0 { 1373 die("read failed"); 1374 } 1375 1376 if ret == 0 { 1377 l.goteof = 1; 1378 if l.start == l.fill { 1379 l.eof = 1; 1380 return -1; 1381 } 1382 break; 1383 } 1384 1385 l.fill = l.fill + ret; 1386 } 1387 1388 ch = l.buf[ptr] as int; 1389 ptr = ptr + 1; 1390 1391 if ch == '\n' { 1392 lineno = lineno + 1; 1393 colno = 1; 1394 } else { 1395 colno = colno + 1; 1396 } 1397 1398 state = l.link[state * 256 + ch]; 1399 if state == -1 { 1400 break; 1401 } 1402 1403 if l.tag[state] { 1404 tag = l.tag[state]; 1405 l.end = ptr; 1406 l.end_lineno = lineno; 1407 l.end_colno = colno; 1408 } 1409 } 1410 1411 if !tag { 1412 die("huh"); 1413 return -1; 1414 } 1415 1416 l.skip = 0; 1417 ret = tag(l); 1418 if !l.skip { 1419 return ret; 1420 } 1421 } 1422 } 1423 } 1424 1425 func dolex(c: *compiler, name: *byte, out: *file) { 1426 var l: *lex; 1427 var fd: int; 1428 var tok: int; 1429 l = setup_lex(c); 1430 fd = open(name, 0, 0); 1431 if fd < 0 { 1432 die("open failed"); 1433 } 1434 open_lex(l, fd); 1435 loop { 1436 tok = gettok(l); 1437 if tok < 0 { 1438 die("problem"); 1439 } 1440 if tok == 0 { 1441 return; 1442 } 1443 fputs(out, "\t'"); 1444 fputb(out, &l.buf[l.start], l.end - l.start); 1445 fputs(out, "'\n"); 1446 } 1447 }