|
|
A.3 Generate.Input
#!/usr/local/bin/perl -w
#
# CLASPnet
#
# English sentence generator for my M.Sc. Dissertation
# Copyright 1995-1996. All rights reserved. Ezra Van Everbroeck
#
# 960425 : remove minor punctuation error
# 960409 : change DO to TO coding
# 960406 : add semantics to standard.voc parsing
# 960322 : more stylistic changes
# 960318 : added voice recognition
# reduced word length to 12 characters: 7+5
# 960317 : added DI clause type
# 960313 : added finite vs infinite
# added .rot files (supersede .cls files)
# 960310 : added be-constructions
# 960305 : changed orthographic vocabulary input to 8+7
# 960229 : add orthographic vocabulary input 3+9+3
# 960226 : new parser (part 2)
# 960224 : new SGML-ish grammar so new parser (part 1)
# 960220 : more work on levels analysis (part 2)
# 960219 : support for multiple file locations on many machines
# add level of subclause support (part 1)
# 960214 : fix main clause unit 1 for part connector clauses bug
# add 'the more ... the less' construction
# add 'however/no matter how ...' construction
# 960212 : fix problem with finding standard grammar
# 960204 : continuing translation + extra features
# 960203 : translation from old AWK + shell scripts to Perl
#
##
# Set the global stuff
##
@ran_gram = (); @ran_voc = (); @sen_out = ();
@io_pats = (); $pat_file = ""; $len_total = 0;
$sen_default = 100; # default number of sentences
$len_default = 20; # default length of sentences
$len_input = 85; # number of input units per word
$len_output = 17; # number of output units per word
chop($hostname = `hostname`);
if ($hostname eq "onyx") {
$snns_tools_dir = "/home/admin/ezra/zin/snns/tools/bin/pc_linux";
# $snns_xgui_dir = "/home/admin/ezra/zin/snns/xgui/bin/pc_linux";
} elsif ($hostname eq "twentie") {
$snns_tools_dir = "/tartape/SNNSv4.1/tools/bin/sun_os4x";
# $snns_xgui_dir = "/tartape/SNNSv4.1/xgui/bin/pc_linux";
} elsif ($hostname eq "shivan.magic.net") {
$snns_tools_dir = "/local/ann/SNNSv4.1/tools/bin/pc_linux";
# $snns_xgui_dir = "/tartape/SNNSv4.1/xgui/bin/pc_linux";
}
##
# Main routine
##
&intro;
&get_grammar;
&get_voc;
&gen_sentences;
&find_output;
&make_snns;
&finale;
##
# Print intro message
##
sub intro {
print <<END
+-+-+ Starting the Marvelous English Sentence Generator +-+-+
+-+-+ Copyright 1995-1996. Ezra Van Everbroeck +-+-+
END
} # end &intro
##
# Get grammar : from file or make new random one
##
sub get_grammar {
my($ran_num, @raw_gram);
print "Use which grammar file (new name allowed)? ";
chop($grammar_file = <STDIN>);
unless ($grammar_file =~ /\w+.grm$/) { $grammar_file = $grammar_file . ".grm" };
# if it doesn't exist, create new random order grammar
$0 =~ /(\S+)\/\S+/;
$path = $1;
unless (-f $grammar_file) {
open(STD_GRAM,"${path}/standard.grm");
while (<STD_GRAM>) { push(@raw_gram,$_); }
close(STD_GRAM);
srand;
@ran_gram = sort { rand(10000) <=> rand(10000) } @raw_gram;
open(OUT_GRAM,">./${grammar_file}");
foreach (@ran_gram) {
print OUT_GRAM;
}
close(OUT_GRAM);
print "++ Made new grammar file: $grammar_file\n";
# if it does, tell user
} else {
print "++ Using existing grammar file: $grammar_file\n";
open(IN_GRAM,"./${grammar_file}");
while (<IN_GRAM>) { push(@ran_gram,$_); }
close(IN_GRAM);
}
} # end &get_grammar
##
# Generate sentences on the basis of the grammar
##
sub gen_sentences {
my($sen_have, $sen_new, $sen_count, @sen_new_parts, $sen_file, $sen);
local($lhs_count, %lhs, %rhs_count, %rhs);
# check grammar
foreach (@ran_gram) {
chop;
@_ = split(/[ \t]+/,$_);
if ($_[1] eq "->") {
$lhs_count = ++$lhs{$_[0]} ;
$rhs_count{$_[0],"-",$lhs_count} = $#_-1;
for ( $j=2 ; $j<=$#_ ; $j++ ) {
$rhs{$_[0], "-", $lhs_count, "-", $j-1} = $_[$j];
}
} else {
die "Illegal instruction: $_ !\n";
}
} ;
# decide length and number of sentences wanted
print "Maximum number of words per sentence [$len_default]? ";
chop($len_want = <STDIN>); unless ($len_want) { $len_want = $len_default; }
print "Number of sentences to create [$sen_default]? ";
chop($sen_want = <STDIN>); unless ($sen_want) { $sen_want = $sen_default; }
$sen_have = 0;
# generate until we have enough
until ($sen_have >= $sen_want) {
$sen_new = &generate("s");
$sen_new =~ s/\s+/ /g;
# some semantic / stylistic patches (needs own subroutine)
$sen_new =~ s/do never/never/g;
$sen_new =~ s/do be/be/g;
$sen_new =~ s/do it be/be it/g;
$sen_new =~ s/do it not be/be it not/g;
$sen_new =~ s/do it never be/be it never/g;
# only count words
$sen_count = $sen_new;
$sen_count =~ s/<\S+>//g;
$sen_count =~ s/[,?!.]//g;
#print "+ $sen_count\n";
@sen_new_parts = split(/ +/,$sen_count);
shift(@sen_new_parts);
$len_new = @sen_new_parts;
# only short enough sentences qualify
unless ($len_new > $len_want) {
push(@sen_out,$sen_new);
#print STDOUT "$sen_new\n";
++$sen_have;
$len_total += $len_new;
# tell user what is happening
if ($sen_want > 100) {
unless ($did_it) { print STDERR "++ Creating: " ; $did_it = 1 }
if (($sen_have % 100) == 0) { print STDERR ". $sen_have " }
} else {
unless ($did_it) { print STDERR "++ Creating: "; $did_it = 1 }
if (($sen_have % 10) == 0) { print STDERR ". $sen_have " }
}
}
}
print "\n";
# offer to create file with all the sentences
($def_file = $grammar_file) =~ s/\.grm$//;
print "Use which name for sentences file [${def_file}.sen]? ";
chop($sen_file = <STDIN>); $sen = $sen_file;
unless ($sen_file) { $sen_file = $def_file . ".sen" } ;
unless ($sen_file =~ /\w+.sen$/) { $sen_file = $sen_file . ".sen" };
open(SEN_FILE,">./$sen_file");
foreach(@sen_out) {
print SEN_FILE "$_\n";
}
close(SEN_FILE);
# change default file name if appropriate
if ($sen) { $def_file = $sen; }
} # end &gen_sentences
##
# Real generation
##
sub generate {
my($symbol, $var1, $var2, $sen_grow, $sen_grow2) = @_ ;
$sen_grow = "";
if (defined $lhs{$symbol}) {
$var1 = int($lhs{$symbol} * rand(1))+1;
for ( $var2=1 ; $var2<=$rhs_count{$symbol,"-",$var1} ; $var2++ ) {
$sen_grow2 = &generate($rhs{$symbol,"-",$var1,"-",$var2});
$sen_grow = $sen_grow . " " . $sen_grow2;
}
return $sen_grow;
} else {
$sen_grow = $sen_grow . " " . $symbol;
return $sen_grow ;
}
} # end &generate
##
# Find correct output for each clause of each sentence
##
sub find_output {
my(@bits,$end,$clause_ok,@ubits,$ufo,@ubits2,$cls_file,$level,%dec2bin);
my($routput);
%dec2bin = ( "0" => "0 0 0 ", "1" => "1 0 0 ", "2" => "0 1 0 ",
"3" => "1 1 0 ", "4" => "0 0 1 ", "5" => "1 0 1 ",
"6" => "0 1 1 ", "7" => "1 1 1 ");
$level = 0;
$id2out{"0"} = "1 0 0 0 0 0 0 0 0 0 0";
open(ROT_FILE,">./${def_file}.rot");
print "++ Saving results output file as well: ${def_file}.rot\n";
foreach $sentence (@sen_out) {
# remove bad punctuation
$sentence =~ s/\. (,|and|or)/$1/g;
$sentence =~ s/! (,|and|or)/$1/g;
$sentence =~ s/\? (,|and|or)/$1/g;
$sentence =~ s/(R[EI]>) , (<\/R[EI])/$1 $2/g;
$sentence =~ s/, (<\/WH> \?)/$1/;
$sentence =~ s/, (<\/YN> \?)/$1/;
$sentence =~ s/, (<\/..>) \./$1 ./g;
# split in parts
@parts = split(' ',$sentence);
# find mood
$p_mood = &find_mood($parts[$#parts]);
# start parsing
foreach (@parts) {
s/\s*(\S+)\s*/$1/;
if ($_ =~ /<\w/) {
++$id;
$p_out = &read_sgml($_);
$id2out{"$id"} = $p_out;
} elsif ($_ =~ /^<\//) {
$id--;
$p_out = $id2out{"$id"};
if ($id == 0) { ++$id; }
} elsif ($_ =~ /(\.|\?|!|,)/) {
$output = "0 " x $len_output;
$bin_pat = &bin_words($_);
push(@io_pats,"$bin_pat\n$output\n");
$routput = "0 0 0 - 0 0 0 - 0 - 0 0 0 0 0 0 0 - 0 - 0 - 0";
printf ROT_FILE ("%-20s%s\n",$_,$routput);
# push(@io_pats,"$_\n$output\n");
unless ($_ =~ ",") { $id = 0 ; }
} else {
$p_id = $dec2bin{$id};
$bin_pat = &bin_words($_);
$output = $p_mood . $p_id . $p_out;
push(@io_pats,"$bin_pat\n$output\n");
$routput = &rout($output);
printf ROT_FILE ("%-20s%s\n",$_,$routput);
# push(@io_pats,"$_\n$output\n");
}
}
# print "$_ \n";
}
close(ROT_FILE);
} # end &find_output
##
# Make more readable desired output for .rot file
##
sub rout {
my(@u) = split(' ',$_[0]);
my($rout);
$rout =
"$u[0] $u[1] $u[2] - $u[3] $u[4] $u[5] - $u[6] - $u[7] $u[8] $u[9] $u[10] " .
"$u[11] $u[12] $u[13] - $u[14] - $u[15] - $u[16]";
return $rout;
} #end &rout
##
# Read an SGML pattern to get all the values for that clause
##
sub read_sgml {
my($sgml) = $_[0];
my(%func2bin) = (
"DE" => "1 0 0 0 0 0 0 0", "EX" => "0 1 0 0 0 0 0 0",
"WH" => "0 0 1 0 0 0 0 0", "YN" => "0 0 0 1 0 0 0 0",
"RE" => "0 0 0 0 1 0 0 0", "CO" => "0 0 0 0 0 1 0 0",
"HO" => "0 0 0 0 0 1 0 0", "TM" => "0 0 0 0 0 1 0 0",
"TH" => "0 0 0 0 0 0 1 0", "TO" => "0 0 0 0 0 0 1 1",
"EO" => "0 1 0 0 0 0 0 1", "CT" => "0 0 0 0 0 1 0 1",
"RI" => "0 0 0 0 1 0 0 1", "BD" => "1 0 0 0 0 0 0 0",
"BY" => "0 0 0 1 0 0 0 0", "BW" => "0 0 1 0 0 0 0 0",
"BI" => "0 1 0 0 0 0 0 0", "BN" => "0 0 0 1 0 0 0 0",
"BT" => "0 0 0 0 0 0 1 0", "DI" => "1 0 0 0 0 0 0 1",
);
my(%stat2bin) = ("M" => "1 ", "S" => "0 ");
my(%vois2bin) = ("A" => " 1", "P" => " 0");
my(%pola2bin) = ("P" => " 1", "N" => " 0");
# count them (to get number of clauses)
++$num_clauses;
# parse elements in SGML tag
$sgml =~ /<([A-Z]+)_([\w=]+)_(\w)>/;
$p_func = $func2bin{$1};
$p_stat = $stat2bin{$3};
# print "SGML: $sgml - $1 - $2 - $3\n";
@tags = split(/_/,$2);
foreach (@tags) {
($tag, $value) = split(/=/,$_);
$tag{"$tag"} = $value;
}
$p_pola = $pola2bin{$tag{"PO"}};
$p_vois = $vois2bin{$tag{"VO"}};
#print STDERR "$sgml: " . $p_stat . $p_func . $p_pola . "\n";
$sgml_out = $p_stat . $p_func . $p_vois . $p_pola;
return $sgml_out;
} # end &read_sgml
##
# Find mood
##
sub find_mood {
my($final) = $_[0];
if ($final eq ".") { $mood = "1 0 0 "; }
if ($final eq "!") { $mood = "0 1 0 "; }
if ($final eq "?") { $mood = "0 0 1 "; }
if ($final =~ /\w+/) { die "Word final element is: \"$final\" \n" }
return $mood;
} # end &find_mood
##
# Clean SGML tags to obtain the sentence (bits)
##
sub clean {
($clean = $_[0]) =~ s/<[a-zA-Z=\(\)\.,\- 0-9\/]+>//g;
$clean =~ s/^ //;
return $clean;
} # end &clean
##
# Get vocabulary from file and make new version (if necessary)
##
sub get_voc {
my($ran_num, @raw_voc, $i);
($def_file = $grammar_file) =~ s/\.grm$//;
print "Use which vocabulary file (new name allowed) [${def_file}.voc]? ";
chop($voc_file = <STDIN>);
unless ($voc_file) { $voc_file = $def_file . ".voc" } ;
unless ($voc_file =~ /\w+.voc$/) { $voc_file = $voc_file . ".voc" };
# if it doesn't exist, create new vocabulary file from the standard one
unless (-f $voc_file) {
open(STD_VOC,"${path}/standard.voc");
while (<STD_VOC>) {
chomp;
if ($_ =~ /0/) {
s/1\./1/g;
s/\.0/0/g;
$ortho = &find_ortho($voc_line);
push(@ran_voc,"$voc_line -> ${ortho}$_\n");
} else {
s/\s*//g;
$voc_line = $_;
}
}
close(STD_VOC);
# foreach (@ran_voc) { print "$_ \n"; }
open(OUT_VOC,">./${voc_file}");
foreach (@ran_voc) {
print OUT_VOC;
chop;
($word, $pattern) = split(/ -> /,$_);
$convert{$word} = $pattern;
}
close(OUT_VOC);
print "++ Made new vocabulary file: $voc_file\n";
# if it does, tell user
} else {
print "++ Using existing vocabulary file: $voc_file\n";
open(IN_VOC,"./${voc_file}");
while (<IN_VOC>) {
chop;
($word, $pattern) = split(/ -> /,$_);
$convert{$word} = $pattern;
}
close(IN_VOC);
}
} # end &get_voc
##
# Find orthographic output for each word in the vocabulary
##
sub find_ortho {
my($word) = $_[0];
my($left,$rait,$bin,$ch);
my($lin) = length($word);
my(%conv) = (
" " => "00000",
"a" => "10000", "b" => "01000", "c" => "11000",
"d" => "00100", "e" => "10100", "f" => "01100",
"g" => "11100", "h" => "00010", "i" => "10010",
"j" => "01010", "k" => "11010", "l" => "00110",
"m" => "10110", "n" => "01110", "o" => "11110",
"p" => "00001", "q" => "10001", "r" => "01001",
"s" => "11001", "t" => "00101", "u" => "10101",
"v" => "01101", "w" => "11101", "x" => "00011",
"y" => "10011", "z" => "01011", "," => "11011",
"." => "00111", "?" => "10111", "!" => "01111",
"'" => "11111"
);
if ($lin < 6) {
$left = $word . " " x (7 - $lin);
$rait = " " x (5 - $lin) . $word;
} elsif ($lin == 6) {
$left = $word . " ";
$rait = substr($word,1,5);
} else {
$left = substr($word,0,7);
$rait = substr($word,($lin-5),5);
}
$bin = $left . $rait;
$ch = $bin;
for (keys %conv) {
if ($conv{$_}) {
$ch =~ s/\Q$_\E/$conv{$_}/g;
} else {
die "!! $_ can not be binified ... aborting ...\n";
}
}
$ch =~ s/((0|1))/$1 /g;
#print "$ch\n";
return $ch;
} # end &find_ortho
##
# Replace the words by the binary patterns from the voc
##
sub bin_words {
my($word) = $_[0];
$convert{$word} or die "!! No vocabulary pattern for: \"$word\". Aborting ...\n\n";
$word =~ s/\Q$word\E/${convert{$word}}/;
return $word;
} # end &bin_words
##
# Make appropriate SNNS header for the I/O patterns
##
sub make_snns {
my($num_pats);
$num_pats = @io_pats;
# get name for pattern file
print "Name for SNNS patterns file (will overwrite) [${def_file}.pat]? ";
chop($pat_file = <STDIN>);
unless ($pat_file) { $pat_file = $def_file . ".pat" } ;
unless ($pat_file =~ /\w+.pat$/) { $pat_file = $pat_file . ".pat" };
# make SNNS header
system("${snns_tools_dir}/mkhead $num_pats $len_input $len_output > ./$pat_file");
# add patterns
open(PAT_FILE,">>./$pat_file");
foreach (@io_pats) { print PAT_FILE }
close(PAT_FILE);
print "++ Created pattern file.\n";
} # end &make_snns
##
# Finale : give some statistics
##
sub finale {
$time = localtime;
$num_avg = ($len_total / $sen_want);
($inf_file = $pat_file) =~ s/pat$/inf/;
print "\n++ Writing summary file: $inf_file\n";
open(INF_FILE,">./$inf_file");
print INF_FILE "
+-+-+ Starting the Marvelous English Sentence Generator +-+-+
+-+-+ Copyright 1995-1996. Ezra Van Everbroeck +-+-+
Created with $0 on $time
Pattern file is $pat_file
It contains $sen_want sentences (and $num_clauses clauses)
It used a maximum sentence length of $len_want words (average is $num_avg words)
It used grammar file $grammar_file and vocabulary file $voc_file
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
Grammar file:
";
foreach (@ran_gram) { print INF_FILE "$_\n"; }
print INF_FILE "
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
Vocabulary file:
";
foreach (@ran_voc) { print INF_FILE "$_\n"; }
print INF_FILE "
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
";
close(INF_FILE);
print "++ Done.\n\n";
} # end &finale