#!/usr/bin/perl

################################################################################
#
# File: HVite.pl
# A Perl script for parallel processing of the command 'HVite' from HTK
# 
# Usage:
#	HVite.pl [options] VocabFile HMMList DataFiles...
#
# For details about the usage, check it by typing "HVite" without any options
#	
# This perl script is designed to run transparently, e.g., you can run this
#   script as if you run 'HVite'.
#
# This script submits parallel jobs through the SGE (Sun Grid Engine) using
#   the SGE command 'qsub' and checks the job progress using 'qstat'
# 
# It returns when all the parallel jobs are finished
#
# If the script size is not large enough, then this script submits a single job
# In this case, this script does not check for job completion so that the main
#   routine can send multiple jobs to the cluster. So you need to check the job
#   completion in the main routine if necessary.
#
# Written by Bowon Lee, 02/22/2006
#
# Department of the Electrical and Computer Engineering
# University of Illinois at Urbana-Champaign
#
################################################################################

# Specify the number of processors
$NP = 32;	# Number of processors

# Specify the command to be executed
$COMMAND = "HVite";

# Check my user ID.  'whoami' terminates its output with a newline; strip it
# so that the name can be embedded cleanly in later command lines.
$USERID = readpipe("whoami");
$USERID = "" unless defined $USERID;
chomp($USERID);

# Check for the input script file following the option '-S'
#   and output label file following the option '-i'.
# $NSCP / $NMLF hold the index of the option VALUE inside @ARGIN; when an
#   option is given more than once, the last occurrence wins.
@ARGIN = @ARGV;
foreach $n (0..$#ARGIN)  {
    $NSCP = $n+1 if($ARGIN[$n] eq "-S");
    $NMLF = $n+1 if($ARGIN[$n] eq "-i");
}

# '-S' is required in every mode because the script size decides whether the
# work is split.  Without this check an undefined index would be used as 0
# and the first command line argument would be opened as the script file.
die "Usage: $0 [options] -S ScriptFile [-i OutputMLF] VocabFile HMMList\n"
    unless(defined $NSCP && $NSCP <= $#ARGIN);
die "$0: option '-i' needs a file name\n"
    if(defined $NMLF && $NMLF > $#ARGIN);
$scpi = "$ARGIN[$NSCP]";
# '-i' is only mandatory for the parallel case, so a missing '-i' is not an
# error here; $NMLF stays undefined and $mlfi is left empty.
$mlfi = defined $NMLF ? "$ARGIN[$NMLF]" : "";

# Open the input script and compute the script size for each processor.
# The file is read line by line so that it is never held in memory as a whole.
open(my $scp_fh, "<", $scpi) || die "Cannot open $scpi: $!";
$NLINES = 0;
while(my $scp_line = <$scp_fh>) { $NLINES += 1; }
close($scp_fh);
$SCPSIZE = int($NLINES/$NP);

# When the script size for each processor is at least 32 lines,
if($SCPSIZE >= 32) {

    # Create a list of divided data
    #
    # indexed_name(PATH, INDEX) returns PATH with "_INDEX" inserted in front
    # of the extension of its last path component, e.g.
    #     "dir/all.scp", 3  ->  "dir/all_3.scp"
    # A name without extension gets the suffix appended ("list" -> "list_3"),
    # and a dot that belongs to a directory name is never treated as the
    # start of an extension ("out.v2/labels" -> "out.v2/labels_3").  This
    # guarantees that every job gets its own file and that the original
    # input file is never reused as a per-job file.
    # (A named sub is compiled at file scope even though it is written here.)
    sub indexed_name {
        my ($path, $index) = @_;
        if($path =~ m{\A(.*[^/])(\.[^./]*)\z}s) {
            return "${1}_${index}${2}";
        }
        return "${path}_${index}";
    }

    @scpn = ();
    @mlfn = ();
    foreach $n (0..$NP-1)  {
        $scpn[$n] = indexed_name("$scpi", $n);
        $mlfn[$n] = indexed_name("$mlfi", $n);
    }
    # Divide the data and write them into each script file
    #
    # This is the first step of the parallel branch that touches the disk, so
    # refuse to go on here if no output label file was given: the per-job
    # label files are merged into the '-i' file later, and without '-i' the
    # argument replacement further down would hit the wrong argument.
    die "$0: parallel processing needs the output label file option '-i'\n"
        unless(defined $NMLF && defined $ARGIN[$NMLF]);

    open(my $scp_in, "<", $scpi) || die "Cannot open $scpi: $!";
    my $scp_out;
    $n = 0;
    $nlines = 0;
    while(my $line = <$scp_in>) {
        # A new piece starts every $SCPSIZE lines.  Once the last piece
        # (number $NP-1) is open, $n equals $NP and no further piece is
        # started, so the last piece also receives the remaining lines.
        if( ($nlines == $SCPSIZE * $n) && ($n != $NP) ) {
            if(defined $scp_out) {
                close($scp_out) || die "Cannot write $scpn[$n-1]: $!";
            }
            open($scp_out, ">", $scpn[$n]) || die "Cannot open $scpn[$n]: $!";
            $n = $n + 1;
        }
        print {$scp_out} $line;
        $nlines += 1;
    }
    close($scp_in);
    if(defined $scp_out) {
        close($scp_out) || die "Cannot write $scpn[$n-1]: $!";
    }

    # Create command for each processor
    #
    # The arguments reach this script after the calling shell has already
    # removed the user's quoting, and they are written into a bash script
    # again, so every argument that is not made of plainly safe characters
    # has to be re-quoted (not only those containing '*').
    # (Named subs are compiled at file scope even though they are written here.)

    # shell_quote_arg(ARG): return ARG as one word for a POSIX shell.  Plain
    # words are returned unchanged; anything else (including the empty
    # string) is wrapped in single quotes, with embedded single quotes
    # written as '\''.
    sub shell_quote_arg {
        my ($arg) = @_;
        return $arg if($arg =~ m{\A[\w./:=+,\@%-]+\z});
        (my $quoted = $arg) =~ s/'/'\\''/g;
        return "'$quoted'";
    }

    # build_job_command(PROGRAM, ARGS, REPLACEMENT_FOR): return the command
    # line "PROGRAM arg0 arg1 ...".  ARGS is a reference to the argument list;
    # REPLACEMENT_FOR is a reference to a hash mapping an argument index to
    # the value that is used instead of the original argument at that
    # position.
    sub build_job_command {
        my ($program, $args, $replacement_for) = @_;
        my @words = ($program);
        foreach my $i (0 .. $#{$args}) {
            my $word = exists $replacement_for->{$i} ? $replacement_for->{$i}
                                                     : $args->[$i];
            push(@words, shell_quote_arg($word));
        }
        return join(" ", @words);
    }

    @commands = ();
    foreach $n (0..$NP-1)  {
        # Each job reads its own piece of the script file and writes its own
        # label file; all other arguments are passed through unchanged.
        my %replacement_for = ($NSCP => $scpn[$n], $NMLF => $mlfn[$n]);
        $commands[$n] = build_job_command($COMMAND, \@ARGIN, \%replacement_for);
    }

    # Write script for each processor
    #
    # For every job a small bash script "<COMMAND>_<n>.sh" is written and
    # handed to 'qsub'.  The lines starting with '#$' are SGE directives.
    foreach $n (0..$NP-1)  {
        $scps = "$COMMAND\_$n.sh";
        open(my $sge_fh, ">", $scps) || die "Cannot open $scps: $!";
        print {$sge_fh} "#!/bin/bash\n",
                        "#\$ -S /bin/bash\n",
                        "#\$ -cwd\n",
                        "\n",
                        "$commands[$n]\n";
        # The script must be complete on disk before qsub reads it; closing
        # also reports write errors that would otherwise go unnoticed.
        close($sge_fh) || die "Cannot write $scps: $!";
        # List form: no shell is involved in starting qsub.  Stop at the
        # first failed submission instead of waiting for a job that will
        # never produce its label file.
        system("qsub", $scps) == 0
            || die "qsub $scps failed (status $?); jobs submitted so far keep running\n";
    }

    # Wait until all the jobs are completed
    #
    # Every 30 seconds 'qstat' is asked for the jobs of this user, and the
    # lines mentioning $COMMAND are counted.  A failing qstat call produces no
    # job lines either, so its exit status is checked: a failure must not be
    # mistaken for "all jobs finished", because the merge and clean-up steps
    # would then run while jobs are still writing their results.
    (my $qstat_user = "$USERID") =~ s/\s+\z//;   # drop the newline of 'whoami'
    my $qstat_failures = 0;
    my $max_qstat_failures = 10;   # consecutive failures before giving up
    while(1) {
        sleep 30;
        print "Checking job progress: ";
        @jobs = readpipe("qstat -u $qstat_user");
        if($? != 0) {
            $qstat_failures += 1;
            die "qstat failed $qstat_failures times in a row; giving up "
              . "(submitted jobs and temporary files are left untouched)\n"
                if($qstat_failures >= $max_qstat_failures);
            print "qstat failed (status $?), will retry...\n";
            next;
        }
        $qstat_failures = 0;
        $nproc = grep { /\Q$COMMAND\E/ } @jobs;
        last if($nproc == 0);
        print "$nproc jobs are still running...\n";
    }
    print "Done\n";

    # Check any errors
    #
    # The files matching "<COMMAND>*.sh.e*" are read in glob order.  Their
    # lines are collected in @errors; if there are any, they are saved in
    # "<COMMAND>_errors".  $errorcheck is the number of collected lines and
    # is used as a flag later on.  Reading the files directly avoids running
    # 'cat' through the shell twice and its "No such file" message when no
    # such file exists.
    print "Checking any errors: ";
    @errors = ();
    foreach my $errfile (glob("$COMMAND*.sh.e*")) {
        # A file that cannot be read contributes nothing, as with 'cat'.
        open(my $err_fh, "<", $errfile) || next;
        push(@errors, <$err_fh>);
        close($err_fh);
    }
    $errorcheck = $#errors + 1;
    if($errorcheck) {
        open(my $log_fh, ">", "$COMMAND\_errors")
            || die "Cannot open $COMMAND\_errors: $!";
        print {$log_fh} @errors;
        close($log_fh) || die "Cannot write $COMMAND\_errors: $!";
    }
    # Finish the progress line started above.
    print $errorcheck ? "$errorcheck line(s) of error output found\n" : "none\n";

    # Merge the results
    #
    # The per-job label files are concatenated into the output file given
    # with '-i'.  The '#!MLF!#' header is written once at the top; the header
    # line of each per-job file is skipped.  Only a first line that really is
    # the header is dropped, so a file without header loses no data.
    print "Merging results: ";
    open(my $mlf_out, ">", $mlfi) || die "Cannot open $mlfi: $!";
    print {$mlf_out} "#!MLF!#\n";
    foreach $n (0..$NP-1)  {
        open(my $mlf_part, "<", $mlfn[$n]) || die "Cannot open $mlfn[$n]: $!";
        my $at_first_line = 1;
        while(my $line = <$mlf_part>) {
            my $is_header = ($at_first_line && $line =~ /^#!MLF!#/);
            $at_first_line = 0;
            print {$mlf_out} $line unless($is_header);
        }
        close($mlf_part);
    }
    # Buffered write errors (e.g. disk full) only show up when closing.
    close($mlf_out) || die "Cannot write $mlfi: $!";
    print ": Completed!\n";

    # Clean temporary files
    #
    # For every job its piece of the script file, its label file, its SGE
    # script and everything else whose name starts with the SGE script name
    # are removed.  unlink() is used instead of
    # 'rm -f' through the shell: like 'rm -f' it silently skips missing
    # files, but file names are never interpreted by a shell and no extra
    # processes are started.
    print "Cleaning temporary files: ";
    foreach $n (0..$NP-1)  {
        unlink($scpn[$n], $mlfn[$n], glob("$COMMAND\_$n.sh*"));
    }
    print "Done\n";

    # If error occurred, then print this message
    if($errorcheck) {
        print STDERR "Error occured: Please check $COMMAND\_errors\n";
    }

}  # if($SCPSIZE >= 32)  {

# When the script size is not large enough, then submit a single job
else  {
    print "Script size is too small for parellel processing: submitting a single job\n";

    # Build the command line from $COMMAND (the same setting the parallel
    # branch uses) and the unchanged arguments.  The calling shell has
    # already removed the user's quoting, so every argument that is not made
    # of plainly safe characters is single-quoted again for the bash script;
    # embedded single quotes are written as '\''.
    my $quote_arg = sub {
        my ($arg) = @_;
        return $arg if($arg =~ m{\A[\w./:=+,\@%-]+\z});
        (my $quoted = $arg) =~ s/'/'\\''/g;
        return "'$quoted'";
    };
    $command = join(" ", $COMMAND, map { $quote_arg->($_) } @ARGIN);

    # Write the SGE script; the lines starting with '#$' are SGE directives.
    $scps = "$COMMAND.sh";
    open(my $sge_fh, ">", $scps) || die "Cannot open $scps: $!";
    print {$sge_fh} "#!/bin/bash\n",
                    "#\$ -S /bin/bash\n",
                    "#\$ -cwd\n",
                    "\n",
                    "$command\n";
    # The script must be complete on disk before qsub reads it.
    close($sge_fh) || die "Cannot write $scps: $!";

    # Submit and return at once: job completion is not checked in this case.
    system("qsub", $scps) == 0 || die "qsub $scps failed (status $?)\n";
}
