#!/usr/bin/perl -w
use strict;
use Getopt::Long;
use Pod::Usage;
use Config::IniFiles;
use HtkIO;
use Util;
use Data::Dumper;
# Command-line settings.  Every variable below is filled in by GetOptions
# and read by the rest of the script.
my ($dictFile, $trainDir, $letToCodeFname, $help);
my $purpose   = 'forTraining';	# default when --purpose is not given
my $noClobber = 0;

# Option specification: each entry maps a Getopt::Long spec to its target.
my @optionSpec = (
	"dict|d=s"          => \$dictFile,
	"htkTrainDir|t=s"   => \$trainDir,
	"noClobber|n"       => \$noClobber,
	"purpose|p=s"       => \$purpose,
	"letterCodeMap|l=s" => \$letToCodeFname,
	"help|h"            => \$help,
);

# Unparseable options: print the usage and exit with status 2.
GetOptions(@optionSpec) or pod2usage(2);

# --help: print the documentation and exit.
if ($help) {
	pod2usage(-exitstatus => 2, -verbose => 3);
}

# Both the dictionary and the training directory are mandatory.
if (!$dictFile || !$trainDir) {
	pod2usage(2);
}

# Flags that say which output files this run creates.
# All file i/o operations further down are conditioned on these being true.
my ($doScp, $doMlf, $doHtkDict, $doObs, $doLetToCode, $doAlphabet);

# One row per legal --purpose value; columns are, in order:
#   scp, mlf, htkDict, obs, letToCode, alphabet
# (for forAlignment the mlf file has different label names!)
my %outputsFor = (
	'forTraining'  => [1, 1, 1, 1, 1, 1],
	'forQuery'     => [1, 0, 0, 1, 0, 0],
	'forAlignment' => [0, 1, 0, 0, 0, 0],
);

if (!exists $outputsFor{$purpose}) {
	die 'invalid purpose.  See usage.';
}

($doScp, $doMlf, $doHtkDict, $doObs, $doLetToCode, $doAlphabet)
	= @{ $outputsFor{$purpose} };

# When no map file was named on the command line, use the one that lives in
# the training directory.
if (!$letToCodeFname){
	$letToCodeFname="$trainDir/letToCode.pld";
}


$|=1; #flush after every print


# Fail early when required inputs are missing: the dictionary is always
# needed, and forQuery cannot work without an existing letter=>code map.
die "cannot access dictionary $dictFile"  if(! -e "$dictFile");
die "letter To Code file $letToCodeFname does not exist." if($purpose eq 'forQuery' && ! -e $letToCodeFname);

# Create the output directory and its words/ subdirectory when absent.
# NOTE(review): ecsystem is not defined in this file (presumably from Util);
# the command string is passed to it unquoted - confirm paths with spaces work.
if(! -e "$trainDir"){
	ecsystem("mkdir $trainDir");
}

if(! -e "$trainDir/words"){
	ecsystem("mkdir $trainDir/words");
}



# %letToCode maps each letter (and the special _WB_/_APO_ symbols) to its
# discrete codebook value; $codeCounter is the next unused value.
my %letToCode;
my $codeCounter;
if(-e $letToCodeFname){
	# NOTE(review): loadDumper is not defined in this file (presumably from
	# Util); it is expected to return a hash reference.
	%letToCode = %{loadDumper($letToCodeFname)};
	# Continue numbering after the highest code already in use.  Codes start
	# at 1, so the number of keys equals the highest existing code; using it
	# as the next value would hand a new letter a code that is already taken.
	my $maxCode = 0;
	foreach my $code (values %letToCode){
		$maxCode = $code if ($code > $maxCode);
	}
	$codeCounter = $maxCode + 1;
}
else{
	%letToCode=();
	$codeCounter=1; #htk doesn't handle discrete codebook value 0
}

# Walk the dictionary one entry per line and, depending on the $do* flags,
# write the HTK script file (dict.scp), the master label file (dict.mlf),
# the cleaned dictionary (cleanDict.txt) and one observation file per word
# under words/.  Letters not yet in %letToCode receive the next free code,
# except in forQuery mode where an unknown letter is fatal.
my ($dictFh, $scpFh, $htkDictFh, $mlfFh);
open($dictFh, '<', $dictFile) or die "$!: cannot open <$dictFile";
if ($doScp){
	open($scpFh, '>', "$trainDir/dict.scp") or die "$!: cannot open >$trainDir/dict.scp";
}
if ($doHtkDict){
	open($htkDictFh, '>', "$trainDir/cleanDict.txt") or die "$!: cannot open >$trainDir/cleanDict.txt";
}
if ($doMlf){
	open($mlfFh, '>', "$trainDir/dict.mlf") or die "$!: cannot open >$trainDir/dict.mlf";
	print {$mlfFh} "#!MLF!#\n";
}
while(my $entry = <$dictFh>){
	print STDERR '.' if ( !($. % 1000));	#progress mark every 1000 lines
	chomp $entry;
	my ($obsName, $phones) = split(' ',$entry,2);
	next if (!defined($obsName));		#blank line: nothing to convert
	$phones = '' if (!defined($phones));	#entry without a pronunciation
	$obsName =~ s/\(([0-9]*)\)/+$1/g;	#variant marker "(2)" becomes "+2"
	$obsName =~ s/\+[0-9]*//g if($purpose eq 'forAlignment');	#alignment labels carry no variant suffix
	$obsName =~ s/[^a-zA-Z_'0-9+]+//g;	#keep only letters, digits, _ ' and + in the name
	my $letterStr = $obsName;
	$obsName =~ s/^CON$/_CON/;	#you are not allowed to create a file named CON or AUX under windows. amazing.
	$obsName =~ s/^AUX$/_AUX/;
	$letterStr =~ s/[^a-zA-Z_']+//g;	#remove non-alphabet chars

	print {$scpFh} "$trainDir/words/$obsName.dsc\n" if ($doScp);
	if ($purpose ne 'forQuery'){
		$phones =~ s/[0-9]//g;	#get rid of accent information
		if ($doMlf){
			print {$mlfFh} "\"*/$obsName.lab\"\n";
			#wb is a special 'word boundary' phone
			print {$mlfFh} join("\n", 'wb', split(' ', $phones), 'wb', ".\n");
		}
		print {$htkDictFh} "$letterStr wb $phones wb\n" if ($doHtkDict);
	}

	#_WB_ is a special word boundary letter placed at both ends of the word
	my @letters = ('_WB_', split(//,$letterStr), '_WB_');
	my @codeSeq=();
	foreach my $letter (@letters){
		$letter = '_APO_' if ($letter eq "'");	#the apostrophe is stored under the name _APO_
		if (!defined($letToCode{$letter})){
			if ($purpose eq 'forQuery'){
				die "$letter is not in the recognizable alphabet.";
			}
			$letToCode{$letter}=$codeCounter++;
		}
		push @codeSeq, $letToCode{$letter};
	}

	#with --noClobber an existing observation file is left alone
	if(!$noClobber  || ! -e "$trainDir/words/$obsName.dsc"){
		#packHtk is defined outside this file; as before it is called even when $doObs is false
		my $packedObs = packHtk(10, 1, 1, @codeSeq);
		if ($doObs){
			#report $! (the OS error), not $? (child exit status), when the open fails
			open(my $obsFh, '>', "$trainDir/words/$obsName.dsc") or die ("cannot open >$trainDir/words/$obsName.dsc\n$!");
			#NOTE(review): if packHtk returns binary data, binmode($obsFh) is needed on Windows - confirm
			print {$obsFh} $packedObs;
			close($obsFh) or die ("cannot close $trainDir/words/$obsName.dsc\n$!");
		}
	}
}
print STDERR "\n";
if ($doMlf){
	close($mlfFh) or die "$!: cannot close $trainDir/dict.mlf";
}
if ($doHtkDict){
	close($htkDictFh) or die "$!: cannot close $trainDir/cleanDict.txt";
}
if ($doScp){
	close($scpFh) or die "$!: cannot close $trainDir/dict.scp";
}
close($dictFh);

# Save the (possibly extended) letter=>code map as a Data::Dumper dump.
# The open is checked with low-precedence "or": with "||" the die would bind
# to the filename string and could never fire, hiding a failed open.
if ($doLetToCode){
	open(my $letToCodeFh, '>', $letToCodeFname) or die ("cannot open >$letToCodeFname: $!");
	print {$letToCodeFh} Dumper(\%letToCode);
	close($letToCodeFh) or die ("cannot close $letToCodeFname: $!");
}

# Write the sorted alphabet, one symbol per line, with a trailing newline.
if ($doAlphabet){
	open(my $alphabetFh, '>', "$trainDir/alphabet.txt") or die ("cannot open >$trainDir/alphabet.txt: $!");
	print {$alphabetFh} join("\n",(sort keys %letToCode), '');
	close($alphabetFh) or die ("cannot close $trainDir/alphabet.txt: $!");
}

exit 0;

__END__


=head1 NAME

dictToHTK.pl - convert a dictionary to HTK training data for letter to phone function

=head1 SYNOPSIS

dictToHTK.pl [options] 

  Required:
   -d,	--dict=<fname>		dictionary file
   -t,	--htkTrainDir=<dir>	where to save the training data 
  Options:
   -p,  --purpose=forTraining|forQuery|forAlignment		specifies which files are created, default: forTraining
   -l,  --letterCodeMap=<fname> load the letter=>codemap from specified file.	
   -n,  --noClobber		Do not reprocess words for which files already exist
   -h,	--help			brief help message

=head1 OPTIONS

=over 4

=item B<--help>

Prints this help message and exits.

=item B<--purpose>

Legal values are forQuery, forAlignment, forTraining.

if forTraining:
create all files (mlf, observations, htk script file, alphabet, letter2code, and htk dictionary)

if forQuery:

Assume the dictionary has no definitions and do not create alphabet.txt, cleanDict.txt, and dict.mlf.  
The letter=>code map file must exist.
With this option, if a letter is encountered that is not in the letter=>code map, an error is thrown, and 
the letter=>code map file is not modified.

if forAlignment:
create only the dict.mlf file.

=item B<--letterCodeMap>

Load the letter=>code map from specified file. If omitted, uses "<htkTrainDir>/letToCode.pld".
If necessary, the file will be created. It is augmented with any new letters and saved
only when --purpose=forTraining; forQuery and forAlignment never modify it.

=back

=head1 DESCRIPTION

B<dictToHTK.pl> will read a dictionary (in CMU format) and create a set of observation sequences
and labels suitable for training a discrete HMM model with HTK.  Within the sequence each observation
represents the letter in the spelling of the word, and the corresponding label is that letter.
The output directory will contain a file for each word definition, so there can be tens of thousands 
of files as the output of this program.


=cut
