#!/usr/bin/perl -w
use strict;
use Getopt::Long;
use Pod::Usage;
use Data::Dumper;
use Text::WagnerFischer qw(distance);
use Util;
use Carp;

####################################################################
my $modelDir;
my $queryDir;
my $verbose=0;

{
    my $dataRootDir;
    my $dictFile;
    my $queryFile;
    my $outputFile;
    my $decodeDebug=1; #call HVite if necessary - set to false to speed up debugging
    my $forceDecode=0; #force decoding always.
    my $alignment=1; #do a forced alignment if possible. If false, always align with the decoded phones
    my $startStage=1;
    my $modelType='triphone';
    my $help;
    
    GetOptions (
                "modelDir|m=s"  => \$modelDir,
                "dict|d=s"  => \$dictFile,
                "queryDir|q=s" => \$queryDir,
                "queryFile|f=s" => \$queryFile,
                "outputFile|o=s" => \$outputFile,
                "verbose|v=i" => \$verbose,
                "modelType|t=s" => \$modelType,
                "decode!" => \$decodeDebug,
                "alignment!" => \$alignment,
                "forceDecode" => \$forceDecode,
                "help|h" => \$help) 
    or pod2usage(2);
    
    pod2usage(-exitstatus => 2, -verbose => 3) if $help;
    
    pod2usage(2) if !($modelDir and $queryDir and $queryFile);
    
    confess("cannot open query file $queryFile")  if(! -e "$queryFile");
    die "cannot access config dir $modelDir"  if(! -e "$modelDir");
    die "cannot access dictionary $dictFile"  if(defined($dictFile) and ! -e "$dictFile");
    if (! -e $queryDir){
        ecsystem("mkdir $queryDir");
    }
    
    #prepare the queries
    my @queries, my %queries;  		#can contain partial words in context using [] 
    #my %modelQueries; 			#the full word passed on to HTK as the key 
    
    
    open(QUERY, "<$queryFile") ||  die "$!: cannot open <$queryFile";
    while(<QUERY>){
        chomp;
		next if (/^$/);
        $_ = uc;
        s/^([^ ]*) .*$/$1/;	#so we can use dictionary files for queries
        s/\([0-9]+\)$//;	#get rid of alternate definition markers e.g. ACCEPT(2) 

        my $qu = $_;
        push @queries, $qu;
        $queries{$qu}{'query'}=$qu;

		#The dictionary consists of a-zA-Z_'
		#Any word with numbers and underscores will be transliterated as an acronym, and not passed to HTK
		#The - is a hack, always translating to an empty sequence of phones.

		if  (!(m/[^a-zA-Z_'0-9\[\]\-]/ || /^-*$/)){ #FIXME make the dictionary, training and file-naming alphabets to be consistant somehow
	        s/[\[\]-]//g;	#get rid of [] brakets and punctuation.  This will be passed to HVite, if decode/align is needed #FIXME the [-i]on test case
			$queries{$qu}{'queryWord'}=$_;
		} 

    }
    close QUERY;
    
    
    
    #load up the dictionary
    (my $dict, my $dictMlfName) = loadDict($dictFile, $modelType);
    
    #the numbers are pronounced in english - everything else is language independent
    doEasyModels($dict, \%queries);
    
    #decide how to predict phones
    choosePredictMethod(\%queries, $dict);

    my (%decode, %align);
	foreach (keys %queries){
		if($queries{$_}{'bestRes'} eq 'alignRes' && !$alignment){ #decoding is forced even though aligment may be possible
			$queries{$_}{'bestRes'}='recRes';
		}
	
		if($queries{$_}{'queryWord'}){
			if($queries{$_}{'bestRes'} eq 'alignRes'){
				$align{$queries{$_}{'queryWord'}}=0;
			}
			if($queries{$_}{'bestRes'} eq 'recRes' || $forceDecode){
				$decode{$queries{$_}{'queryWord'}}=0;
			}
		}
	}


    #if necessary do decoding and or alignment.  Don't do it if decodeDebug has been set to 0.
    my $predictedMlfFname = "$queryDir/predicted.mlf";
    my $predictedAlignmentMlfFname = "$queryDir/predictedAlignment.mlf";
    
    if((keys %decode) > 0){
        prepObs("$queryDir/query.scp", [keys %decode]);
        if($decodeDebug){
            if ($modelType eq 'monophone'){
                ecsystem("HVite -A -D -H $modelDir/monophone1.hdf -w $modelDir/biphone.slf -i $predictedMlfFname $modelDir/phoneDict.dic $modelDir/phonebet.txt -S $queryDir/query.scp 1>&2");
            }
            elsif($modelType eq 'triphone'){
                ecsystem("HVite -A -D -t 20 -H $modelDir/triphone4.hdf -H $modelDir/monophone1.hdf -w $modelDir/bigramtriphone.slf -i $predictedMlfFname $modelDir/triphoneDict.dic $modelDir/allHmms.txt -S $queryDir/query.scp 1>&2");
            }
        }
        readModelResults($predictedMlfFname,\%decode);
    
    }
    if((keys %align) > 0){
        prepObs("$queryDir/align.scp",[keys %align]) ;
        if($decodeDebug){
            if ($modelType eq 'monophone'){
                ecsystem("HVite -A -D -H $modelDir/monophone1.hdf -a -I $dictMlfName  -i $predictedAlignmentMlfFname $modelDir/phoneDict.dic $modelDir/phonebet.txt -S $queryDir/align.scp 1>&2");
            }
            elsif($modelType eq 'triphone'){
                ecsystem("HVite -A -D -t 20 -H $modelDir/triphone4.hdf -H $modelDir/monophone1.hdf -a -I $dictMlfName  -i $predictedAlignmentMlfFname $modelDir/triphoneDict.dic $modelDir/allHmms.txt -S $queryDir/align.scp 1>&2");
            }
        }
        readModelResults($predictedAlignmentMlfFname,\%align);
    }
    
	foreach (keys %queries){
		if($queries{$_}{'queryWord'}){
			my $qw=$queries{$_}{'queryWord'};

			if($align{$qw}){
				$queries{$_}{'alignRes'}=$align{$qw};
			}

			if($decode{$qw}){
				$queries{$_}{'recRes'}=$decode{$qw};
			}
		}
	}

    
    #do edit distance to true dict definition, if possible
    my $stats;
    if($verbose>=2){
        $stats = evalQuality(\%queries);
    }
    
    #use the models to answer original queries with possibly partial words
    answerQuery(\%queries);
    
    
    
    #print out the results
    my $outf;
    if ($outputFile){
        open($outf, ">$outputFile") || confess("cannot open >$outputFile");
    }
    else{
        $outf = *STDOUT;
    }
    
	my $badQueries="";
	foreach my $query (@queries){
		if ($queries{$query}->{'badRes'}){
			$badQueries .= "$query  $queries{$query}->{'badRes'}\n";
		}
	}

    if($verbose==0){
        foreach my $query (@queries){
            my	$bestResStr = join (' ', @{$queries{$query}->{'result'}});
            print $outf "$query  $bestResStr\n";
        }
        print $outf "\nErrors:\n$badQueries" if $badQueries;
    }
    if($verbose == 1){
        my @reslist=();
        foreach my $query (@queries){
            push @reslist, $queries{$query};
        }
        print $outf Dumper(\@reslist);
    }
    if($verbose>=2){
        print $outf Dumper($stats);
        my $wordAcc = $stats->{'matchingWordCount'}/$stats->{'totalWordCount'};
        my $meanPhoneEditDist = $stats->{'totalEditDistance'}/$stats->{'phoneCount'};
        print $outf "word accuracy rate: $wordAcc\n";
        print $outf "mean phone edit distance: $meanPhoneEditDist\n";
        print $outf "\nErrors:\n$badQueries" if $badQueries;
    }
    if($verbose==3){
            foreach my $query (keys %queries){
                my $details = $queries{$query};
                if($details->{'editDist'}){
					#print $outf Dumper($details->{'bestRes'});
                    print $outf "$query ($details->{'editDist'}): $details->{'dictRes'} == $details->{'editDistPhoneStr'}\n";
            }
    
        }
    }
    
    

    exit;
}



#######subroutines########

#load the provided dictionary and create the MLF file for alignment if specified, otherwise use the dictionary that came with the model
#strip the leading and trailing special "wb" phones in the values dict{$word} .
sub loadDict{
	my $dictFile = shift;
	my $modelType = shift;	
	my	$dictMlfName;
	my %dict=();
	if ($dictFile){
		ecsystem("perl -w -S dictToHTK.pl -d $dictFile -p forAlignment -t $queryDir");
		if($modelType eq 'triphone'){
			ecsystem("HLEd -T 2 -A -l $queryDir/words -i $queryDir/dictTriPhones.mlf $modelDir/cfg/mktri.led $queryDir/dict.mlf");		
			$dictMlfName = "$queryDir/dictTriPhones.mlf";
		}
		elsif($modelType eq 'monophone'){
			$dictMlfName = "$queryDir/dict.mlf";
		}
	}
	else{ #use the model's dictionary if one wasn't povided
		$dictFile = "$modelDir/cleanDict.txt";
		if($modelType eq 'triphone'){
			$dictMlfName = "$modelDir/dictTriPhones.mlf";
		}
		elsif($modelType eq 'monophone'){
			$dictMlfName = "$modelDir/dict.mlf";
		}

	}

	open(DICT, "<$dictFile") ||  die "$!: cannot open <$dictFile";
	while(<DICT>){
		chomp;
		(my $word, my $phones) = split(' ',$_,2);
		$word =~ s/\([0-9]+\)$//;	#get rid of alternate definition markers e.g. accept(2) 
		$phones =~ s/[0-9]//g;		#get rid of phone accents
        $phones =~ s/wb//g;        #get rid of the leading and trailing special " wb" phones
        chomp($phones);
		$dict{$word}=$phones if (!defined($dict{$word}));
	}

	if($verbose){
		print scalar(keys %dict) . "words loaded.\n";
	}

	close DICT;
	
	return(\%dict, $dictMlfName);

}

#figure out the models for acronyms and axact dictionary matches
#the numbers are pronounced in english - everything else is language independent
sub doEasyModels{
    my ($dict, $queries) = @_;
    
    
    #get the dictionary and translit results if possible
	#FIXME translit is lanugage specific and possibly wrong
    foreach (keys %$queries){
    	my $dictPhones = $dict->{$_};
    	if ($dictPhones){
		  my @dictPhonesList = split(' ',$dictPhones);
		  $queries->{$_}{'dictRes'}= join(' ', @dictPhonesList);
        }
        if(/[0-9]/ || /[[:alnum:]]_+[[:alnum:]]('.?)?/){
			if($queries->{$_}{'queryWord'}){
				my ($error,$phoneSeqs) = pronounceAcronymOrNumber($queries->{$_}{'queryWord'},$dict);
				if($error){
					$queries->{$_}{'badRes'}=$error;
				}
				else{
					$queries->{$_}{'translitRes'}=$phoneSeqs;
				}
			}
        }
    }
    #print Dumper( $modelQueries);

}

#determines the pronunciation of acronyms and numbers.
#This is english specific.
sub pronounceAcronymOrNumber{
    my ($word,$dict) = @_;
    my @numbers=("OH","ONE","TWO","THREE","FOUR", "FIVE", "SIX", "SEVEN", "EIGHT", "NINE", );

	my @chars;
	if(/[0-9]/){
		@chars = split(//,$word);
	}
	else{
		@chars = split(/_/,$word);
	}
	@chars = map ( /[0-9]/ ? $numbers[$_] : $_, @chars);
	#print "@chars\n";
	my @phones = @$dict{@chars};
		#print "$_\n";
	my $error="";
	my @phoneSeqs=();
	foreach my $ph (@phones){
		if ($ph){
			push(@phoneSeqs, [split(' ', $ph)]);
		}
		else{
			my @phonesSoFar=grep($_,@phones);
			$error .= "translitRes failed on word $word: '@chars'  is translated as '@phonesSoFar'";
		}
	}
	return($error,\@phoneSeqs);

}
            
#decide the best result for a query (align and decode results will be obtained later if needed)
#the order of preference (best to worst) is:
#1)look it up in the dictionary
#2)if the word is an acronym (matches /([A-Z]_)*[A-Z]/) or contains numbers, do 
#        direct letter-to-phones replacement: 
#        "401K" becomes "F AO R OW W AH N K EY"
#        "I_B_M becomes "AY B IY EH M"
#3)do forced alignment against the word in the dictionary
#4)do decoding
#5)report an invalid query if everything else is impossible
#choices 1) and 2) are not possible if a word frament is requested (e.g. "BI[TE]")
#if 1) and 2) are chosen, they are performed immediately
#
sub choosePredictMethod{
    my ($queries, $dict) = @_;
    foreach (keys %$queries){
        if($queries->{$_}{'dictRes'} && not /[\[\]]/){ #in the dictionary and no alignment necessary
            $queries->{$_}{'bestRes'}='dictRes';
        }
        elsif ($queries->{$_}{'translitRes'}){ #acronyms and acronym fragments
            $queries->{$_}{'bestRes'}='translitRes';
        }
        elsif ($queries->{$_}{'badRes'}){#This should have been answered in by one of the above methods, but there was a mistake
            $queries->{$_}{'bestRes'}='badRes';
        }
        elsif (!$queries->{$_}{'queryWord'}){#For the following methods, we need a queryWord (ie the alphabet must be recognized by trained model)
			$queries->{$_}{'badRes'}="The query uses letters not in the recognized alphabet.";
            $queries->{$_}{'bestRes'}='badRes';
        }
        elsif($dict->{$queries->{$_}{'queryWord'}}){ #in the dictionary and alignment is requested, or dashes were present in query but without dashes, the word is in the dict
			#print "lookup of $_: $queries->{$_}{'queryWord'} , $dict->{$queries->{$_}{'queryWord'}}\n";
            $queries->{$_}{'bestRes'}='alignRes';
        }
        else{ #not in the dictionary - do decoding for both fragments and whole words
            $queries->{$_}{'bestRes'}='recRes';
        }
    }
    
    
}


#prepare observation files.  Here a precondition is that different observation files will be named differently.
sub prepObs{
    my ($scpFile, $wordList) = @_;
    open(Q, ">$scpFile.txt") ||  die "$!: cannot open >$scpFile.txt";
    foreach (@$wordList){
        print Q "$_\n";
    }
    close Q;
    ecsystem("perl -w -S dictToHTK.pl -d $scpFile.txt -p forQuery -t $queryDir -l $modelDir/letToCode.pld");
    
    rename "$queryDir/dict.scp","$scpFile";
}

#Handle the partial word cases (the [] brackets) and reinsert the dash if needed.
sub answerQuery{

	(my $queriesRef) = @_;
        #print Dumper($queriesRef);
    
	foreach my $query (keys %$queriesRef){

		my $bestResKind=$queriesRef->{$query}{'bestRes'};
		my $bestModelRes = $queriesRef->{$query}{$bestResKind};

		if (!$bestModelRes){
			$queriesRef->{$query}{'bestRes'}='badRes';
			$queriesRef->{$query}{'badRes'}= 'No pronunciation available';
			$bestResKind=$queriesRef->{$query}{'bestRes'};
			$bestModelRes=$queriesRef->{$query}{'badRes'};
		}

		if($bestResKind ne 'badRes'){

            if(ref($bestModelRes) eq 'ARRAY'){ 
				if ($queriesRef->{$query}{'query'} =~ /-/){
					$bestModelRes=[reinsertDash($queriesRef->{$query}{'query'}, @$bestModelRes)];
				}
                my @borders=(-1,0); #the last and first positions of accepted text, respectively
				my $qb = $query;
				if ($queriesRef->{$query}{'bestRes'} eq 'translitRes'){
					$qb =~ s/_//g;
				}
                if(	$qb =~ m/[\[]/g){
                    $borders[0]= (pos $qb)-2;
                    $qb =~ m/[\]]/g;
                    $borders[1]= (pos $qb)-2;
					#print Dumper($bestModelRes) . "@borders $qb\n";
                }
    
                #print "@borders\n";
                my @bestModelResList = @$bestModelRes;
                my @partialLetters=();
                @partialLetters = @bestModelResList[0..$borders[0]] if ($borders[0]>=0);
                my $lastLetter = @bestModelResList-1;
                push @partialLetters,  @bestModelResList[$borders[1]..$lastLetter] if ($borders[1]<=$lastLetter);
    
                my @partialPhones=();
                map {push @partialPhones, @$_} @partialLetters;
                @partialPhones = condense(@partialPhones);
                #print "@partialPhones\n";
                $queriesRef->{$query}{'result'}=\@partialPhones;
            }
            else{
                $queriesRef->{$query}{'result'}=[split(' ',$bestModelRes)];
            }

		}
		else{
			$queriesRef->{$query}{'result'}=[];
		}
	
	}
}

sub reinsertDash{
	my ($query, @letterList)=@_;
	$query =~ s/[\[\]_]//g;
	#print "*** $query\n";
	my $pos=0;
	my $ind=index($query,'-',$pos);
	while ($ind>=0){
		if ($ind > scalar(@letterList)) {
			print (scalar(@letterList) . " $query , $ind\n") ;
			print Dumper(@letterList);
		}
		splice(@letterList, $ind,0,[]);
		$ind=index($query,'-',$ind+1);
	}
	return @letterList;
}
#calc edit distance between dictionary phone seq and best model phone seq
sub evalQuality{
	my $queriesRef =shift;


	#make a phonebet to char map, so that we can do edit distance between phone strings.
	#all edit distance algs are written for strings, and I'm too lazy to write one for lists
	my $phoneCounter = ord('A');
	my %phonebet=();
	open(PHONEBET, "<$modelDir/phonebet.txt") ||  die "$!: cannot open <$modelDir/phonebet.txt";
	while(<PHONEBET>){
		chomp;
		$phonebet{$_} = chr($phoneCounter++);
	}
	close PHONEBET;

	my %stats=(	'phoneCount' => 0, 
				'totalEditDistance' => 0, 
				'totalWordCount' => 0,
				'matchingWordCount' => 0);


	foreach my $curQuery (values %$queriesRef){
		#print Dumper($curQuery);
		my $editDistance;
		if ($curQuery->{'dictRes'} && ($curQuery->{'alignRes'} || $curQuery->{'recRes'} )){ #do phone edit distance from dict definition to model definition
			my @dictPhoneSeq = split(' ',$curQuery->{'dictRes'});
			my $dictDistStr = join('',@phonebet{@dictPhoneSeq});

			my @m = map {@$_} $curQuery->{'alignRes'} ? $curQuery->{'alignRes'} : $curQuery->{'recRes'};

			my @bestPhoneList=  map {@$_} @m;
			@bestPhoneList = condense(@bestPhoneList);
	

			my $modelDistStr = join('',@phonebet{@bestPhoneList});
			$editDistance=distance($dictDistStr ,$modelDistStr);
			$curQuery->{'editDist'}=$editDistance;

			#may be different from the string in $query{word}{'result'} because 
			#result may have come from some other model besides recRes or alignRes
			$curQuery->{'editDistPhoneStr'}="@bestPhoneList";

			$stats{'phoneCount'} +=length($dictDistStr); 
			$stats{'totalWordCount'}++ ;
			$stats{'totalEditDistance'} += $editDistance;
			$stats{'matchingWordCount'} += ($editDistance == 0);
			
		}	

	}
	
	return \%stats;
	
}

#replaces subsequences of identical phones with a single phone
sub condense{
	my $lastPhone = "not bloody likely";
	my @condensed=();
	foreach (@_){
		next if ($_ eq $lastPhone);
		push @condensed, $_;
		$lastPhone = $_;
	}
	return @condensed;
}

#load in model results from a .mlf file that was created by HVite
sub readModelResults{
	my $mlfFname = shift; 
	my $queryWordRef =shift;
	my $curResultName;
	my @curModelResult;
	my @curLetter=(); #sometimes multiple phones get assigned to the same letter

	my $modelResultsFH;
	open ($modelResultsFH, "<$mlfFname") || confess( "$!: cannot open <$mlfFname");

	foreach(<$modelResultsFH>){
		chomp;
		next if(/^#/);
		if(/^\"/){ 			#"#new result begins
			m/.*\/(.*)\.rec"/;
			$curResultName=$1;
			next;
		}							
		if (/^\.$/){			#result ends
			pop @curModelResult; #get rid of the special wb phone at start and end
			shift @curModelResult;
			$queryWordRef->{$curResultName}=[@curModelResult];
			@curModelResult=();
			next;

		}
		(my $phoneStartPos, my $phoneEndPos, my $curPhone, my $dummy) = split;
		$curPhone =~ s/\+[A-Za-z_]+|[A-Za-z_]+-//g; #remove the context from triphones.
		if($phoneStartPos == $phoneEndPos){#arbitrarily assign letterless phones to the next letter
			push @curLetter, $curPhone;
		}
		else{
			for (my $i=$phoneStartPos; $i<$phoneEndPos; $i++){
				push @curLetter, $curPhone;
				push @curModelResult, [@curLetter];
				@curLetter=();
			}
		}
	}
	return;	

	close($modelResultsFH);
}

=head1 NAME

letterToPhone.pl - use a spelling to phonetic transcription model to generate phonetic transcriptions for
orthographically spelled out words.

=head1 SYNOPSIS

letterToPhone.pl [options] 

 Required:
   -m,	--modelDir=<dir>	the model to use
   -f,   --queryFile=<fname>	file containg the query word list. can be -
   -o,   --outputFile=<fname> file to contain the results of the query.  If not specified, STDOUT is used
   -q,	--queryDir=<dir>   	temp dir for intermediate query files
 Options:
   -d,	--dict=<fname>		will look up the word in the dictionary first, before modelling
   -v,  --verbose=0|1|2|3		type of reporting
   -t,  --modelType=monophone|triphone	type of HMM model (default is triphone)
        --noalignment do not perform HVITE forced alignment pass 
        --nodecode do not perform any HVite decoding (debugging)
        --forceDecode decode the word even if it's found in the dictionary or can be spelled out
   -h,	--help			brief help message

=head1 OPTIONS

=over 4

=item B<--help>

Prints this help message and exits.

=item B<--verbose=0|1|2>
	
	 if 0 or omitted, print out the word with best pronounciation available

    if 1, print out all available info about a query.  This output can be read into PERL via EVAL statement
    
    if 2, print summaries of phone and word error rates against the dictionary definitions. The calculations are done on queries' corresponding full words.
	 
	 if 3, print out same as 2 and also all the mistakes

=back

=head1 DESCRIPTION

B<letterToPhone.pl> is useful either when a letter-to-phone alignment is required or 
when a phonetic transcription is not available in a dictionary.
This can happen when the word is misspelled, or when in speech transcriptions used for
speech recognizer training, the transcription shows that a word was only partially pronounced.
Using only the phones corresponding to the pronounced part of the word (might?) improve training quality.


Describe the dictionary format, model, query format, punctuation.
=cut
