use strict; use warnings; # this is a generic nipl parser # see generic_parser.txt for details # tokenizer # TODO # - I don't like the code for numeric constants, seems not very generic # - check out how pliant does numeric constants # - allow support for other bases (e.g. binary)? # - support unicode identifiers (other letters) # - use ` ' instead of " " and ' ' for quoting strings and " for apostrophe? # - how to distinguish chars and single-char strings? prefix: c`f' ? # - should we allow ctrl-chars in strings (e.g. \0) # - should we allow vtab for a section break? #sub tokenize { # my ($text) = @_; # my @tokens; # for ($text) { # while ($_ ne "") { # # newline # s/^(\n)//s or # # tabs # s/^(\t+)//s or # # spaces - other whitespace is forbidden for now # s/^( +)//s or # # comment # s/^(#[^\n]*)//s or # # brackets # s/^([][(){}])//s or # # hex number # s/^(0x[0-9a-f]+(u|l|ul|lu)?)//si or # # oct number # s/^(0[0-7]+(u|l|ul|lu)?)//si or # # XXX deciaml int missing! # # decimal floating-point # s/^((\d*\.\d+|\d+\.)(e[-+]?\d+)?[fl]?)//si or # s/^(\d+(e[-+]?\d+)[fl]?)//si or # # quoted string # s/^(l?"([^"\\\n]|\\.)*")//si or # # quoted character - actually allows any string! # s/^(l?'([^'\\\n]|\\.)*')//si or # # identifier - textual operators will be matched here # s/^([a-z_][a-z0-9_]*)//si or # # operator - anything else, greedy # s/^([^a-z0-9_\s#"'\x0-\x1f\x7f]+)//si or # # other whitespace / ctrl-chars - forbidden # # (except in strings) # s/^([\x0-\x1f\x7f])//s && # die "invalid character 0x".sprintf("%x", ord($1)) or # # failed to match??? # die "didn't match a character???"; # my $token = $1; # push @tokens, $token; # } # } # return \@tokens; #} sub split_into_lines { my ($text) = @_; if ($text !~ /\n\z/s) { die "text does not end with a newline\n"; } my @lines = split /\n/, $text, 0; return \@lines; } # sub measure_indent { # my ($lines) = @_; # my @out; # for (@$lines) { # my $line = $_; # $line =~ s/(\t*)( *)//; # push @out, [length($1), length($2), $line]; # } # return \@out; #} # collect_block # # each output line is a list of components # each component can be a line or a block # # @$lines is a list of lines input, $block is a list of statements output # it will append to @$block if there is stuff already in it # NEED REAL SUBROUTINES!!! can't wait for nipl! sub list_reader { my ($list, $index) = @_; $index ||= 0; return sub { if ($index < @$list) { return $$list[$index++]; } else { return; } }; } sub list_writer { my ($list) = @_; $list ||= []; return sub { push @$list, @_; }, $list; } # STRUCTURED PROGRAMMING CONSIDERED HARMFUL!!! sub collect_block { my ($r, $w) = @_; my $statement = []; my $line; $line = $r->() defined $line or return; do { if ($line =~ s/^\t//) { my @subblock; do { push @subblock, $line; $line = $r->() or last; } while ($line =~ s/^\t//); push @$statement, \@subblock; } elsif ($line =~ /^ /) { if (@$statement == 0) { # this is just being pedantic - # is there any reason we _would_ want to allow # this? die "continuation line at start of block"; } push @$statement, $line; } else { if (@$statement) { $w->($statement); $statement = []; } push @$statement, $line; } } while (defined $line); if (@$statement) { $w->($statement); } } sub tokenize_statements { } # untokenize - this will return text for the code sub untokenize { my ($tokens) = @_; return join('', @$tokens); } # parser # TODO # - support cond ? true : false operator ??? # these are the types of tokens: # newline, tab, space, comment # bracket # constant: numeric (hex, oct, dec), string, char, (not boolean) # identifier # operator # add support for symbols / symbol constants? # - do we want to be able to reconstruct the original text exactly from the # parse tree? yes, if possible: at least it should be readable, neat # - leave comments, space tokens and brackets in the tree for this purpose # - they are ignored # - when we want to ignore a token, but leave it so can reconstruct the # original text, we prefix with "\x1" # - when we insert tokens to clarify things, they are prefixed with "\x2" # # - allow comments that don't start at beginning of line? # the parse tree is a list of tokens, or sub-lists. # an expression in brackets translates to a sub-list. # an indented block is equivalent to a sub-list of lists, i.e.: # if (a + b) == 10 # print a # print b # else # print b # # -> # # if (a + b) == 10 ( (print a) (print b) ) else ( (print b) ) # # -> # # if ((a + b) == 10) ( (print a) (print b) ) else ( (print b) ) # you can't continue a comment like this: # # # comments aren't continued # over newlines # # this is a probably a good thing. # perhaps it would be better not to tokenize it completely at the start. # we don't currently support multi-line string literals, but could perhaps # support reading a string literal from a file # # perhaps we could have multi-line strings like this: # print " # here is a multi-line # string. I can even indent bits of it: # hello world # foo # foo bar baz # # that seems good to me! # # don't use "." for an empty block, rely on editor to be able to highlight # tabs and spaces somehow my $ignore = "\x1"; my $hidden = "\x2"; sub split_into_lines { my ($tokens) = @_; my @lines; my $line = []; for (@$tokens) { push @$line, $_; if (/^\n$/) { $$line[-1] =~ s/^/$ignore/; push @lines, $line; $line = []; } } if (@line) { die "missing newline at end of stream"; } return \@lines; } sub parse_blocks_and_continuing_lines { my ($lines) = @_; my @out; my $indent = 0; for (@$lines) { my $i = 0; if ($$_[0] =~ /^(\t+)$/) { my $new_indent = length($1); $$_[0] =~ s/^/$ignore/; while ($indent < $new_indent) { push @out, $hidden."in"; ++$indent; } while ($indent > $new_indent) { push @out, $hidden."out"; --$indent; } ++$i; } if ($$_[$i] =~ /^( +)$/) { my $continuing = length($1); $$_[0] =~ s/^/$ignore/; push @out, $hidden."continue"; } push @out, $_; } } sub build_tree { my ($lines) = @_; my @tree; my @stack; my $current_block = \@tree; for (@$lines) { if (!ref $_) { if ($_ eq $hidden."in") { my $new_block = []; push @$current_block, $new_block; push @stack, $current_block; $current_block = $new_block; } elsif ($_ eq $hidden."out") { if (@stack == 0) { die "cannot go out, already at top level"; } $current_block = pop @stack; } elsif ($_ eq $hidden."continue") { if (@$current_block == 0) { die "cannot continue a line, no previous line"; } push @{$$current_block[-1]}, @$_; } } } }