#!/usr/bin/perl -w
use strict;
use Getopt::Long;
use Pod::Usage;
use Util;
use Carp;
# herest($cmd, $in, $out, $eps, $modelDir)
#
# Runs HERest repeatedly until the average per-frame probability it reports
# improves by no more than $eps between two consecutive passes, then moves the
# contents of $modelDir/tmp to $out.
#
#   $cmd      - HERest command line; its last space-separated token is taken
#               to be the HMM list and is re-appended after the -H/-M options
#   $in       - HMM definition file read by the first pass
#   $out      - destination of the final model (assumed to be a file)
#   $eps      - minimum improvement required to run another pass
#   $modelDir - working dir; $modelDir/tmp must already exist.  herest.log and
#               wordsNeedTeeing.txt are (re)written in it
#
# Dies (confess) whenever a pass leaves no "Reestimation" figure in
# herest.log.  The exit status of the pipeline is that of its last command,
# so the log is the only place a HERest failure shows up.
sub herest{
	my ($cmd, $in, $out, $eps, $modelDir) = @_;
	my ($lastAvgProb, $nextAvgProb, $itCounter) = (-1, -1, 0);

	# Split off the trailing HMM list so -H/-M can be inserted in front of it.
	my @cmdList = split(/ /, $cmd);
	my $hmmList = pop @cmdList;

	# First pass reads $in and tees the *unfiltered* output into the log, so
	# the warning lines (which start with a space) are still available below.
	my $firstCmd = join(' ', @cmdList, "-H", $in, "-M ", "$modelDir/tmp", $hmmList, " | tee $modelDir/herest.log | grep '^[^ ].*'");
	# Later passes read whatever the previous pass wrote to tmp/ and log only
	# the filtered output.
	my $iterCmd = join(' ', @cmdList, "-H", "$modelDir/tmp/*", "-M ", "$modelDir/tmp", $hmmList, " | grep '^[^ ].*' | tee $modelDir/herest.log");

	my $getProbCmd = "cat $modelDir/herest.log  | grep Reestimation | " . 'sed \'s/.*frame = \(.*\)$/\1/\'';

	# Reads the figure of the latest pass from herest.log.  A log that says
	# "log prob" is converted with exp() so the caller always compares plain
	# probabilities.
	my $readAvgProb = sub {
		my $prob = `$getProbCmd`;
		chomp $prob;
		confess("HERest terminated with error") if ($prob eq "");
		my $isLog = `cat $modelDir/herest.log  | grep 'Reestimation .* log prob '`;
		$prob = exp($prob) if ($isLog ne "");
		return $prob;
	};

	#run it once
	ecsystem($firstCmd);
	#pick up the words with more phones than letters - these indicate phone models that need teeing
	ecsystem("cat $modelDir/herest.log | grep -E '^( WARNING \\\[-7324\\\] ).*' |  sed -e 's/^\\\( WAR\\\).* File //' -e 's/\\\.dsc - .*\$/\\\.lab/' > $modelDir/wordsNeedTeeing.txt");
	$nextAvgProb = $readAvgProb->();
	$itCounter++;

	#run it a few more times; $lastAvgProb starts at -1 so a second pass always runs
	while ($nextAvgProb-$lastAvgProb > $eps){
		$lastAvgProb = $nextAvgProb;
		ecsystem($iterCmd);
		$nextAvgProb = $readAvgProb->();
		$itCounter++;
	}
	print "   herest iterated $itCounter times, eps = ".($nextAvgProb-$lastAvgProb)."\n";
	ecsystem("mv -v $modelDir/tmp/* $out");
}



####################################################################
# Command-line handling, working-directory preparation and stage dispatch.
my $modelDir;
my $configDir;
my $dictFile;
my $startStage=1;
my $help;
# Snapshot the raw command line first: GetOptions removes what it parses from @ARGV.
my $params = join (' ', @ARGV);
GetOptions (
			"modelDir|m=s"  => \$modelDir,
			"dict|d=s"  => \$dictFile,
			"configDir|c=s"  => \$configDir,
			"startStage|s=s" => \$startStage,
            "help|h" => \$help) 
or pod2usage(2);

pod2usage(-exitstatus => 2, -verbose => 3) if $help;

pod2usage(2) if !($dictFile and $configDir and $modelDir);

# Reject an unknown stage before touching the file system; otherwise the
# script would only fail at the goto below, after tmp/ has been wiped.
my %knownStage = map { $_ => 1 } qw(1 1_1 1_2 1_3 2 2_1 2_2 3 3_1 3_2 3_3 3_4 3_5 3_6);
die "unknown start stage '$startStage' (expected one of: ".join(' ', sort keys %knownStage).")\n"
	if !$knownStage{$startStage};

$|=1; #flush after every print


die "cannot access config dir $configDir"  if(! -e "$configDir");
die "cannot access dictionary $dictFile"  if(! -e "$dictFile");

if(! -e "$modelDir"){
	ecsystem("mkdir -v $modelDir");
}

if(! -e "$modelDir/words"){
	ecsystem("mkdir -v $modelDir/words");
}

# tmp/ is the scratch dir of HERest/HHEd; start every run with an empty one.
if(! -e "$modelDir/tmp"){
	ecsystem("mkdir -v $modelDir/tmp");
}
else{
	ecsystem("rm -rf $modelDir/tmp/*");
}

#save the config state

open (my $paramsFh, '>', "$modelDir/params.txt") or die "cannot open >$modelDir/params.txt: $!";
print $paramsFh "$params";
close $paramsFh or die "cannot close $modelDir/params.txt: $!";

# Last path component of the config dir.  Works with or without a trailing
# slash and for a bare directory name that contains no slash at all.
my ($cfgDirName) = $configDir =~ m{([^/]+)/*$};
die "cannot derive a directory name from config dir $configDir" if !defined $cfgDirName;
#print "cfgDirName $cfgDirName $1\n";
# The directory is left over from a previous run when restarting at a later
# stage, so only create it when missing.  'or' (not '||') so the die can fire.
if(! -d "$modelDir/$cfgDirName"){
	mkdir "$modelDir/$cfgDirName" or die "cannot mkdir $modelDir/$cfgDirName: $!";
}
print `cp -rf $configDir/*?.* $modelDir/$cfgDirName`; #copy everything except hidden files and dirs


#done with reading the settings. start the real script now.
# Stage labels are spelled _<stage>, e.g. _2 or _3_1.
goto "_$startStage";

# Stage 1: turn the dictionary into HTK training data and build the phone
# level "language model" (bigram lattice plus a phone-per-word dictionary).
_1:
print "***STAGE 1: preparing training data and 'language model'...\n";
_1_1:
print "** STAGE 1_1: converting dictionary into HTK readable training data...\n";
# NOTE(review): dict.mlf, dict.scp, cleanDict.txt and alphabet.txt are read
# later but never created in this script; presumably dictToHTK.pl writes
# them - verify.
ecsystem("dictToHTK.pl -d $dictFile -t $modelDir");
#ecsystem("grep ^[^\\\".#] $modelDir/dict.mlf | sort -u > $modelDir/alphabet.txt");
# Drop the first field of every line, put each remaining token on its own
# line and keep the unique ones: the phone inventory.
ecsystem("sed -e 's/^[^ ]* //' -e's/  */\\n/g' $modelDir/cleanDict.txt | sort -u > $modelDir/phonebet.txt");
_1_2:
print "** STAGE 1_2: preparing phone lattice (backed-off bi-phones)...\n";
ecsystem("HLStats -T 1 -A -D -o -C $configDir/bigramStats.hls -b $modelDir/biphone.mtr $modelDir/phonebet.txt $modelDir/dict.mlf");
#HLStats is buggy - some times you get a negative probability (a positive log probability)
#-t 3 seems to fix it.
ecsystem("cp $modelDir/phonebet.txt $modelDir/phonebetEnterExit.txt");
# printf instead of 'echo -e': a POSIX sh such as dash has no -e option and
# would write a literal "-e " into the list.
ecsystem("printf '%s\\n' '!ENTER' '!EXIT' >> $modelDir/phonebetEnterExit.txt");
ecsystem("HBuild -T 1 -A -D -n $modelDir/biphone.mtr  $modelDir/phonebetEnterExit.txt $modelDir/biphone.slf");
_1_3:
print "** STAGE 1_3: preparing dictionary (each word is a phone)...\n";
# Two identical columns: every phone is a "word" pronounced as itself.
ecsystem("paste $modelDir/phonebet.txt $modelDir/phonebet.txt >$modelDir/phoneDict.dic");
# Appends the lines "!ENTER<TAB>[]" and "!EXIT<TAB>[]".
ecsystem("printf '%s\\t[]\\n' '!ENTER' '!EXIT' >> $modelDir/phoneDict.dic");
print "***STAGE 1: Done\n";

# Stage 2: write one discrete prototype HMM per phone plus a fixed
# word-boundary HMM, then re-estimate them with HERest.
_2:
print "***STAGE 2: training monophone letter-to-phone HMM.\n";

_2_1:
print "** STAGE 2_1: bootstrapping ininital HMM models...\n";
# Line count of a file.  Whitespace is stripped because some wc
# implementations pad the number, and the counts are pasted into the HMM
# definitions below.
my $countLines = sub {
	my ($file) = @_;
	my $cnt = `cat $file | wc -l `;
	$cnt =~ s/\s+//g;
	return $cnt;
};
my $defCnt = $countLines->("$modelDir/dict.scp");
my $letterCnt = $countLines->("$modelDir/alphabet.txt");
my $phoneCnt = $countLines->("$modelDir/phonebet.txt");
print "$defCnt words\n$letterCnt letters\n$phoneCnt phones\n";

# Without letters the uniform probability below would divide by zero.
die "no letters found in $modelDir/alphabet.txt\n" if ($letterCnt !~ /^\d+$/ or $letterCnt < 1);

# Uniform emission probability 1/$letterCnt in scaled form.
# NOTE(review): -2371.8*ln(p) is presumably HTK's integer encoding of
# discrete probabilities (32767 standing for zero) - confirm in the HTK Book.
my $scaledProb = int(log(1/$letterCnt)*-2371.8);

#FIXME: should the obs probabilities be initialized to the average occurrence of letters?
# Six emitting states (2-7).  The transition matrix offers three paths from
# the entry state: state 2 alone, states 3-4, or states 5-6-7.
my $monoPrototype = << "EOF";
~o <VecSize> 1 <DISCRETE>
~h "monophoneProto"
<BeginHMM>
  <NumStates> 8
  <State> 2 
      <NumMixes> $letterCnt
      <DProb> $scaledProb*$letterCnt
  <State> 3
      <NumMixes> $letterCnt
      <DProb> $scaledProb*$letterCnt
  <State> 4 
      <NumMixes> $letterCnt
      <DProb> $scaledProb*$letterCnt
  <State> 5
      <NumMixes> $letterCnt
      <DProb> $scaledProb*$letterCnt
  <State> 6 
      <NumMixes> $letterCnt
      <DProb> $scaledProb*$letterCnt
  <State> 7
      <NumMixes> $letterCnt
      <DProb> $scaledProb*$letterCnt	
  <TransP> 8
	0.0	0.4	0.3	0.0	0.3	0.0	0.0	0.0
	0.0	0.0	0.0	0.0	0.0	0.0	0.0	1.0
	0.0	0.0	0.0	1.0	0.0	0.0	0.0	0.0
	0.0	0.0	0.0	0.0	0.0	0.0	0.0	1.0
	0.0	0.0	0.0	0.0	0.0	1.0	0.0	0.0
	0.0	0.0	0.0	0.0	0.0	0.0	1.0	0.0
	0.0	0.0	0.0	0.0	0.0	0.0	0.0	1.0
	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
<EndHMM>
EOF
open (my $protoFh, '>', "$modelDir/proto.hdf") or die "$!: cannot open >$modelDir/proto.hdf";
print $protoFh $monoPrototype;
close $protoFh or die "$!: cannot close $modelDir/proto.hdf";

#FIXME: this HMM emits only the letter _WB_ (with probability 1) and nothing
#else.  It relies on _WB_ always being the first letter seen, so that _WB_
#corresponds to value 1.
my $letterCntLessOne=$letterCnt-1;
my $wordBoundaryHmm = << "EOF";
~o <VecSize> 1 <DISCRETE>
~h "wb"
<BeginHMM>
  <NumStates> 3
  <State> 2 
      <NumMixes> $letterCnt
      <DProb> 0 32767*$letterCntLessOne
  <TransP> 3
	0.0	1.0	0.0
	0.0	0.0	1.0
	0.0	0.0	0.0
<EndHMM>
EOF

open(my $phonebetFh, '<', "$modelDir/phonebet.txt") or die "$!: cannot open <$modelDir/phonebet.txt";
# The directory survives from an earlier run when this stage is repeated, so
# create it only if missing.  'or' (not '||') so a real failure is reported.
if(! -d "$modelDir/monophone0/"){
	mkdir "$modelDir/monophone0/" or die "$!: cannot create dir $modelDir/monophone0/";
}

open (my $wbFh, '>', "$modelDir/monophone0/wb.hdf") or die "$!: cannot open >$modelDir/monophone0/wb.hdf";
print $wbFh $wordBoundaryHmm;
close $wbFh or die "$!: cannot close $modelDir/monophone0/wb.hdf";

# One copy of the prototype per phone, renamed to that phone.  'wb' keeps the
# hand-written model stored above.
while(my $phone = <$phonebetFh>){
	chomp $phone;
	next if ($phone eq 'wb');
	ecsystem("sed -e \"s/monophoneProto/$phone/\" $modelDir/proto.hdf > $modelDir/monophone0/$phone.hdf");
}
close $phonebetFh;
ecsystem("cat $modelDir/monophone0/* > $modelDir/monophone0.hdf");

_2_2:
print "** STAGE 2_2: HEResting the HMM ...\n";
# Besides training, this call leaves wordsNeedTeeing.txt in $modelDir.
herest("HERest -T 1 -A -D -S $modelDir/dict.scp -I $modelDir/dict.mlf $modelDir/phonebet.txt","$modelDir/monophone0.hdf", "$modelDir/monophone1.hdf", .01, $modelDir);

print "***STAGE 2:  done.\n";

# Stage 3: clone the monophone models into triphones, smooth the triphone
# bigram, add tee transitions where needed and re-estimate with HERest.
_3:
print "***STAGE 3: training triphone letter-to-phone HMM.\n";

_3_1:
print "** STAGE 3_1: creating triphone label and hmm definition files...\n";
ecsystem("HLEd -T 2 -A -n $modelDir/triPhones.txt -l '*' -i $modelDir/dictTriPhones.mlf $configDir/mktri.led $modelDir/dict.mlf");
# Triphone list without the entries that start with "wb".
ecsystem("grep -E \'^([^w]|w[^b]|[^w][^b]).*\$\' $modelDir/triPhones.txt  > $modelDir/triPhonesNoWb.txt");
# HHEd edit script consisting of a single CL command on that list.
my $mktriHed = << "EOF";
CL $modelDir/triPhonesNoWb.txt
EOF
open (my $mktriFh, '>', "$modelDir/mktri.hed") or die "$!: cannot open >$modelDir/mktri.hed";
print $mktriFh $mktriHed;
close $mktriFh or die "$!: cannot close $modelDir/mktri.hed";
ecsystem("HHEd -T 2 -A -H $modelDir/monophone1.hdf -M $modelDir/tmp $modelDir/mktri.hed $modelDir/phonebet.txt");
ecsystem("mv -v $modelDir/tmp/monophone1.hdf $modelDir/triphone2.hdf");
#ecsystem("cat $modelDir/triphone2.hdf $modelDir/monophone0/wb.hdf > $modelDir/triphone3.hdf");


_3_2:
print "** STAGE 3_2: smoothing bigram model...\n";
ecsystem("HLStats -T 1 -A -D -o -C $configDir/bigramTriPhoneStats.hls -b $modelDir/bigramtriphone.mtr $modelDir/triPhones.txt $modelDir/dictTriPhones.mlf");
ecsystem("smoothedBigram.pl -s 1 -m $modelDir -a $modelDir/bigramAdj.pld $modelDir/bigramtriphone.mtr > $modelDir/bigramtriphoneSmoothed.mtr");

_3_3:
print "** STAGE 3_3: teeing triphones which may produce no letters...\n";
#Tee the triphones most likely not to emit a letter. 2290 out of 127002 words in the CMU are affected by this
# wordsNeedTeeing.txt is the list herest() extracted from the HERest warnings.
ecsystem("HLEd -A -D -T 2 -I $modelDir/dictTriPhones.mlf -i $modelDir/wordsNeedTeeing.mlf $configDir/blank.txt -S $modelDir/wordsNeedTeeing.txt");
my $teeableWords = `cat $modelDir/wordsNeedTeeing.txt | wc -l`;
chomp $teeableWords;
print "$teeableWords words have more phones than letters\n";
# Count the labels of those words, most frequent first.
ecsystem("grep -e '^\\(\\([^#\".w]\\)\\|\\(w[^b]\\)\\).*' $modelDir/wordsNeedTeeing.mlf | sort | uniq -c | sort -r > $modelDir/teeableTriphones.txt");
ecsystem("calcTees.pl $modelDir/bigramAdj.pld $modelDir/teeableTriphones.txt > $modelDir/teedTriphones.txt");
# One "AT 1 8 0.01" command (entry-to-exit transition) per selected triphone.
ecsystem("sed -e 's/\\(.*\\)/AT 1 8 0.01 {\\1.transP}/' $modelDir/teedTriphones.txt > $modelDir/tee.hed");
ecsystem("HHEd -A -H $modelDir/triphone2.hdf -M $modelDir/tmp $modelDir/tee.hed $modelDir/triPhonesNoWb.txt");
# Re-attach the hand-written word-boundary model to the edited triphones.
ecsystem("cat $modelDir/tmp/triphone2.hdf $modelDir/monophone0/wb.hdf > $modelDir/triphone3.hdf");
ecsystem("rm $modelDir/tmp/triphone2.hdf");

_3_4:
print "** STAGE 3_4: preparing triphone lattice (bigrams of triphones language model)...\n";
ecsystem("cp $modelDir/triPhones.txt $modelDir/triPhonesEnterExit.txt");
# printf instead of 'echo -e': a POSIX sh such as dash has no -e option and
# would write a literal "-e " into the list.
ecsystem("printf '%s\\n' '!ENTER' '!EXIT' >> $modelDir/triPhonesEnterExit.txt");
ecsystem("HBuild -T 1 -A -D -n $modelDir/bigramtriphoneSmoothed.mtr  $modelDir/triPhonesEnterExit.txt $modelDir/bigramtriphone.slf");

_3_5:
print "** STAGE 3_5: preparing dictionary (each word is a tri-phone)...\n";
ecsystem("makeTriphoneDict.pl $modelDir/phonebet.txt $modelDir/triPhones.txt > $modelDir/triphoneDict.dic");
# Appends the lines "!ENTER<TAB>[]" and "!EXIT<TAB>[]".
ecsystem("printf '%s\\t[]\\n' '!ENTER' '!EXIT' >> $modelDir/triphoneDict.dic");
ecsystem("cat $modelDir/triPhones.txt $modelDir/phonebet.txt | sort -u > $modelDir/allHmms.txt");

_3_6:
print "** STAGE 3_6: HEResting the HMM ...\n";
herest("HERest -T 1 -A -D -S $modelDir/dict.scp -I $modelDir/dictTriPhones.mlf $modelDir/triPhones.txt","$modelDir/triphone3.hdf", "$modelDir/triphone4.hdf", .001, $modelDir);

print "***STAGE 3:  done.\n";


__END__

=head1 NAME

train.pl - build a spelling to phonetic transcription model

=head1 SYNOPSIS

train.pl [options] 

 Required:
   -d,	--dict=<fname>		dictionary file
   -c,	--configDir=<fname>	dir with configuration files
   -m,	--modelDir=<dir>	where to save the model 
 Options:
   -s,	--startStage=i		begin at stage i (e.g. 2 or 3_2) instead of at beginning
   -h,	--help			brief help message

=head1 OPTIONS

=over 4

=item B<-help>

Prints this help message and exits.


=back

=head1 DESCRIPTION


=cut
