#!/usr/local/bin/perl # Plain English Compiler - Roger Espel Llima (21st Apr 1996) # This program is in the Public Domain. # Description: # # Takes a description of an algorithm in a very specific restricted # sub-language of plain English which is still readable and means the same # to the human than it does to the compiler, compiles it into intermediary # perl code and evaluates it. # This is *extremely* primitive, was put out in a few hours of hacking, # and lacks the most basic support for data scopes, expressions, param # passing, error handling, etc... but it achieves the goal of being # *expressive* (i.e just about any algorithm *can* be translated into it, # in a more or less convoluted and awkward way) and *readable* and meaningful # in English. The key point being, it's the *same* English words that you # understand when reading something written for this compiler that the # compiler is understanding too, in a simplified way, but basically with # the same semantics. # The sole intent of this is to prove in practice that there *cannot* be # a formal distinction between software and speech. Between OCR and (a # better version of) this, a perfectly readable but carefully crafted # written paper describing an algorithm can be evaluated on a computer # without more human intervention than it takes to execute traditional # compiled code from magnetic media. In particular, this applies to # cryptographic algorithms. # Note that the compiler itself is general-purpose and knows *nothing* # of cryptography specifically. Certain common cryptographic algorithms # would be hard (although still possible) to write for this compiler because # they need support for long integers. It would be relatively easy to # directly add general supoprt for these in the compiler, without changing # anything to the concept of it. # An example of "source code" the compiler would understand: # (actually the beginning of the description of the TEA cryptographic # algorithm ; I haven't included the functional parts of it to avoid # export problems). # ------------------ begin understood 'source' code ------------------- # First, we'll ask the user for the key. Then ask for the plaintext. We # call delta the value 2654435769. Then we store the 1st element of the # key into k0, put the second element of the key in k1, put the 3rd # element of the key into k2, and put the fourth element of the key into # k3. Now, let i be 1, and let l be the length of the plaintext. As long # as i isn't greater than l, put the element of the plaintext indexed by i # in y, add 1 to i, put the element of the plaintext indexed by i in z, # add 1 to i again, and then do one encryption step, print out y and # print out z. # # To do one encryption step, we first set n to 32, and set sum to 0. Then, # as long as n is greater than 0, we substract 1 from n, add delta to # sum, set t1 to be z shifted to the left by 4 bits, and then ... # [ rest snipped ] # -------------------- 'end understood source code' ------------------- # Description of the recognized language: # The algorithm is to be described in a set of English sentences constructed # with the following structure: # # . a number of adverbs like 'now', 'again', 'finally', 'too', can be # inserted wherever a verb would be expected, for readability. they # are ignored by the parser. # # . a sentence stops at a period, and is made of one or more statement # separated by commas or semicolons. # # . a statement starts with a verb, optionally preceded by "we" or # "we'll" for readability. # # . for conditionals and loops, the implicit block ranges from the "," # after the conditional expression to the end of the sentence. nested # blocks are not supported, use procedures for this. # # . the first sentences up to a declaration starting with 'to' are the # main program; they are optionally followed by a number of explanations # (procedure declarations) defined by the 'to' statement: # to , # the implicit end of a procedure is the end of the source or the # next encountered procedure definition. this is consistent with # the intuitive meaning of the sentences. # . a procedure is called by starting a sentence with 'do', and the # procedure name is the rest of the statement. # . a is any string that is not a common word; starts with a letter. # . a is a number in decimal notation. # . a is a or a # . an is either "first", "second", "third", "fourth" or a # number followed by 'st', 'nd', 'rd, or 'th'. # . a is either a or one of : # shifted [to the] left|right by [bits] # the element of # the element of indexed by # the length of # . a is one of: # is|isn't greater|smaller than # is|isn't equal to # equals # doesn't equal # the recognized statement forms are: # ask [the user] for [the value(s) [of]] # (it expects a list of numbers separated by commas, which goes # into an array, to be retrived with the 'indexed by' constructions) # call [the value [of]] # store|put into # let be # set to be # as long as , blah, blah, blah. # while , blah, blah, blah. # if blah, blah, blah. # add to # substract from # multiply by # divide by # take modulo # xor by # binary and by # binary or by # print out # do # to blah blah blah... # ------------------------------------------------------------------------ # the compiler: # if we were given a -d, run in debug mode -> show the compiled code # instead of evaluating it $ARGV[0] eq '-d' and $debug=1; # read the text, separate words, ignore some undef $/; chomp($s = ); $/ = "\n"; $s = " ".$s; $s =~ tr/A-Z;/a-z,/; $s =~ s/([.,])/ $1 /sg; $s =~ tr/#//d; $s =~ s/\band\b/ , /sg; $s =~ s/(\bwe\'ll\b)|(\bwe\b)|(\bthen\b)|(\bfinally\b)|(\bnow\b)/ /sg; $s =~ s/(\bagain\b)|(\btoo\b)/ /sg; $s =~ s/( ,)+/ ,/sg; $s =~ s/\s+/ /sg; $s .= " "; # error handling is beyond primitive, but it wasn't really the point # of it all. sub abort { my $n = shift; print "I don't understand '$n'\n"; exit 0; } # a few handy parsing routines sub parse_name { my $n = shift; return $n if $n =~ /^\w\S*$/; abort; } sub parse_val { my $n = shift; return $n if $n =~ /^[-.\d]+$/; return '$'.$n if $n =~ /^\w\S*$/; abort; } sub parse_ordinal { my $n = shift; return 1 if $n eq 'first'; return 2 if $n eq 'second'; return 3 if $n eq 'third'; return 4 if $n eq 'fourth'; # could add a few more if ($n =~ /^\d+/) { return $& if $' eq 'st' || $' eq 'nd' || $' eq 'rd' || $' eq 'th'; } abort; } # reads a sub parse_term { my $n = shift; $n =~ s/^the //; return parse_val($n) unless $n =~ /element / || $n =~ /^length / || $n =~ /shifted / || $n =~ /indexed/; return '(' . &parse_val($1) . '<<' . &parse_val($2) . ')' if $n =~ /^(\S+) shifted (?:to the )?left by (\S+)(?: bits)\s*$/; return '(' . &parse_val($1) . '>>' . &parse_val($2) . ')' if $n =~ /^(\S+) shifted (?:to the )?right by (\S+)(?: bits)\s*$/; return '$#'.&parse_name($1) if $n =~ /^length of (?:the )?(\S+)\s*$/; return '$#'.&parse_name($1) if $n =~ /^length of (?:the )?(\S+)\s*$/; return '$' . $2 . '[' . &parse_ordinal($1) . ']' if $n =~ /^(\S+) element of (?:the )?(\S+)\s*$/; return '$' . $1 . '[' . &parse_val($2) . ']' if $n =~ /^element of (?:the )?(\S+) indexed by (?:the )?(\S+)\s*$/; abort; } # reads a sub parse_cond { my $s = shift; $s =~ s/^the //; return &parse_val($1) . ">" . &parse_val($2) if $s =~ /^(\S+) is greater than (\S+) $/; return &parse_val($1) . "<=" . &parse_val($2) if $s =~ /^(\S+) isn\'t greater than (\S+) $/; return &parse_val($1) . "<" . &parse_val($2) if $s =~ /^(\S+) is smaller than (\S+) $/; return &parse_val($1) . ">=" . &parse_val($2) if $s =~ /^(\S+) isn\'t smaller than (\S+) $/; return &parse_val($1) . "==" . &parse_val($2) if $s =~ /^(\S+) equals (\S+) $/ || $s =~ /^(\S+) is equal to (\S+) $/; return &parse_val($1) . "!=" . &parse_val($2) if $s =~ /^(\S+) doesn\'t equal (\S+) $/ || $s =~ /^(\S+) isn't equal to (\S+) $/; abort; } # define &input in the compiled code, so it knows how to input # lists of numbers $code = 'sub input { $_ = ; s/\,/ /g; s/^ +//; my (@s) = (0, split(/ +/, $_)); return @s; } '; # compile a statement sub parse_statement { $s =~ s/^ +//; 1 while $s =~ s/^\, // || $s =~ s/^\. // || $s =~ s/^first //; if ($s =~ /^ask (?:the )?(?:user )?for (?:the )?(?:value )?(?:of )?(\S+) /) { $s = $'; my ($n) = &parse_name($1); $code .= "print '$n? '; \@". $n . "=&input;\n"; } elsif ($s =~ /^call (\S+) (?:the )?(?:value (?:of )?)?([^.,]+) / || $s =~ /^set (\S+) to (?:be )?(?:the )?(?:value (?:of )?)?([^.,]+) /) { $s = $'; $code .= '$'.&parse_name($1) . '=' . &parse_term($2).";\n"; } elsif ($s =~ /^store (?:the )?(?:value (?:of )?)?([^.,]+) in(?:to)? (\S+) / || $s =~ /^put (?:the )?(?:value (?:of )?)?([^.,]+) in(?:to)? (\S+) /) { $s = $'; $code .= '$'.&parse_name($2) . '=' . &parse_term($1).";\n"; } elsif ($s =~ /^let (\S+) be (?:the )?(?:value (?:of )?)?([^.,]+) /) { $s = $'; $code .= '$'.&parse_name($1) . '=' . &parse_term($2).";\n"; } elsif ($s =~ /^add (?:the )?(?:value (?:of )?)?(\S+) to (\S+) /) { $s = $'; $code .= '$'.&parse_name($2) . '+=' . &parse_val($1).";\n"; } elsif ($s =~ /^substract (?:the )?(?:value (?:of )?)?(\S+) from (\S+) /) { $s = $'; $code .= '$'.&parse_name($2) . '-=' . &parse_val($1).";\n"; } elsif ($s =~ /^multiply (\S+) by (?:the )?(?:value (?:of )?)?(\S+) /) { $s = $'; $code .= '$'.&parse_name($1) . '*=' . &parse_val($2).";\n"; } elsif ($s =~ /^divide (\S+) by (?:the )?(?:value (?:of )?)?(\S+) /) { $s = $'; $code .= '$'.&parse_name($1) . '/=' . &parse_val($2).";\n"; } elsif ($s =~ /^take (\S+) modulo (?:the )?(?:value (?:of )?)?(\S+) /) { $s = $'; $code .= '$'.&parse_name($1) . '%=' . &parse_val($2).";\n"; } elsif ($s =~ /^xor (\S+) by (?:the )?(?:value (?:of )?)?(\S+) /) { $s = $'; $code .= '$'.&parse_name($1) . '^=' . &parse_val($2).";\n"; } elsif ($s =~ /^binary or (\S+) by (?:the )?(?:value (?:of )?)?(\S+) /) { $s = $'; $code .= '$'.&parse_name($1) . '|=' . &parse_val($2).";\n"; } elsif ($s =~ /^binary \, (\S+) by (?:the )?(?:value (?:of )?)?(\S+) /) { $s = $'; $code .= '$'.&parse_name($1) . '&=' . &parse_val($2).";\n"; } elsif ($s =~ /^print (?:out )?(?:the )?(?:value (?:of )?)?(\S+) /) { $s = $'; $code .= 'print ' . &parse_val($1) . ', "\n";' . "\n"; } elsif ($s =~ /^if ([^,.]+)\, /) { $s = ", ".$'; abort("nested blocks") if $inblock; $inblock = 1; $code .= 'if (' . &parse_cond($1) . ") {\n"; } elsif ($s =~ /^while ([^,.]+)\, / || $s =~ /^as long as ([^,.]+)\, /) { $s = ", ".$'; abort("nested blocks") if $inblock; $inblock = 1; $code .= 'while (' . &parse_cond($1) . ") {\n"; } elsif ($s =~ /^do ([^,.]+)/) { $s = $'; my ($p) = $1; $p =~ s/\s//g; $code .= '&' . $p . ";\n"; } elsif ($s =~ /^to (?:do )?([^,.]+)/) { $s = $'; my ($p) = $1; $p =~ s/\s//g; $code .= "}\n" if $insub; $code .= "sub $p {\n"; $insub = 1; } if ($s =~ s/^\. // || $s =~ /^\s*$/) { $code .= "}\n", $inblock = '' if $inblock; } elsif ($s !~ s/^\, //) { abort($s); } } # main loop: parse statements until we're done. while ($s ne '') { parse_statement; } # close a sub if we were in one. $code .= "}\n" if $insub; if ($debug) { print $code, "\n"; } else { # do it eval $code; # and complain if something went wrong. print "error evaluating code...\n" if $@; } __END__