/* ------------------------------------------------------------------------- */
/*
  VExtract.c: Extract Vectors whose transcription matches patterns 

  Copyright 2005, Trustees of the University of Illinois

  Licensed under the Apache License, Version 2.0 (the "License").
 You may not use this file except in compliance with the License. You may obtain 
 a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required 
 by applicable law or agreed to in writing, software distributed under the License 
 is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 
 either express or implied. See the License for the specific language governing 
 permissions and limitations under the License.

 Revision history:
   August 2004, Mark Hasegawa-Johnson: First revision created

 Likely future revisions:
   PVTK should be cleaned up, made independent of HTK, merged with SpeechLib,
   and rewritten from the bottom up to be compatible with LAPACK and STL.

   ------------------------------------------------------------------------- */

/* Version string: passed to InitShell and printed by the -R and -V switches in main. */
char *vextract_version = "$Id$";

#include "PVTK.h"
#include <regex.h>
/* NOTE(review): MAX_PATTERNS is not referenced anywhere in the visible part of this file. */
#define MAX_PATTERNS 200

/* ---------------------- Global Variables ----------------------- */
static int  trace  = 0;         /* Trace level; set from the TRACE config parameter and/or the -T switch */
static ConfParam *configs[MAXGLOBS];  /* Config parameter table filled in by GetConfig (see SetConfParms) */
static int nParm = 0;           /* total num params, as returned by GetConfig */
static BufferInfo info;         /* NOTE(review): not referenced in the visible code -- confirm before removing */
static MemHeap mStack;          /* matrix stack; ResetHeap is called on it in main's file-loading loop */
static MemHeap xfStack;         /* Transform stack -- not reset in every loop*/

/* ---------------- Usage Description ------------------------- */

/* Help text printed by ReportUsage.  Points at a string literal, hence const. */
static const char *USAGE="\n\
USAGE: VExtract [opts] src1 ...\n\
\n\
 Suggested usage:\n\
\n\
 (1) Create listings of the training and testing files:\n\
\n\
 ls traindata/*.htk > train.scp;\n\
 ls testdata/*.htk > test.scp;\n\
\n\
 (2) Suppose that MLF lines matching regular expression REG1 correspond to\n\
     frames of class +1, while MLF lines matching REG2 correspond to -1.\n\
     Assume that file MLF.mlf is an HTK-format master label file.  \n\
     Suppose that you want at most 5000 tokens of each class, but that\n\
     you want to make sure to end up with the same number of tokens in each\n\
     class, even if it means getting fewer than 10000 tokens.\n\
     Compute the statistics, and extract the training tokens, using \n\
\n\
 VExtract -T 1 -c /+1/REG1/ -c /-1/REG2/ -m 5000,corpus,locked\n\
   -f train.stats -o train.toks -I MLF.mlf -S train.scp\n\
\n\
 (3) Finally, extract the test tokens, but normalize using\n\
     normalizing statistics from the training tokens:\n\
\n\
 VExtract -T 1 -m 2000,corpus,locked -c /+1/REG1/ -c /-1/REG2/ \n\
   -g train.stats -o test.toks -I MLF.mlf -S test.scp\n\
\n\
 If a line matches regular expression, then all frames between the start\n\
 time and end time specified on that line will be candidates for extraction.\n\
 Regular expression syntax is specified in man 7 regex.  Note, in \n\
 particular,  that word boundaries may be specified by the symbols [[:<:]] \n\
 and [[:>:]] . The actual number of frames extracted from the corpus is \n\
 specified by the -m option. \n\
\n\
 Tokens are taken from uniformly spaced positions spanning each file.\n\
 If -T 1 is specified, VExtract will print (to stderr) a specification\n\
 of the frame number and filename from which each token vector was\n\
 extracted.\n\
\n\
 The MLF read by VExtract is not as flexible as a usual HTK MLF.\n\
 Lines starting with a quotation mark are read as filenames.  Lines\n\
 starting with a digit are read as segment descriptors (start end\n\
 label).  Other lines are ignored.\n\
\n\
 If -f is specified, a statsfile will be generated, and all output\n\
 tokens will be normalized by the rule (x-mu)/sd.  If -g is specified,\n\
 the same type of normalization is applied, but mu and sd are read\n\
 from the first two lines of the specified statsfile, rather than computed.\n\
 If -g and -f are both specified, -g takes precedence, but a new statsfile is\n\
 also written. statsfile is in svmlight format.  First three lines are mean, SD,\n\
 and number of tokens in each dimension.  If -h is specified, the next\n\
 lines are labeled by the histogram threshold, with values equal to\n\
 the histogram count.  Histogram bins contain the number of tokens in each\n\
 bin AFTER normalization, thus it is usually reasonable to specify thresholds\n\
 in the range of roughly -2:0.2:2 (meaning -2 standard deviations, up to +2 sd).\n\
 Following the global stats, stats are given separately for each class.\n\
\n\
 Option                                                     \n\
\n\
 -a outfile       Append toks to outfile in svmlight format\n\
 -c /LABEL/REGEX/\n\
                  When a line in the transcription matches REGEX,\n\
                  every frame between the start time and end time of\n\
                  that line is a candidate for extraction to class LABEL.\n\
 -f statsfile     Print mean and sd of extracted toks to statsfile\n\
 -g statsfile     Normalize outputs using stats in statsfile\n\
 -h /th1,th2,th3/ Compute a histogram with given thresholds\n\
 -h /b:s:e/       Compute a histogram with thresholds b,b+s,b+2s,...,e\n\
 -m NUM,SOURCE,LOCKTYPE\n\
                  Read at most NUM vectors from each SOURCE.\n\
                  NUM is an integer, SOURCE is either `corpus' or `file.'\n\
                  LOCKTYPE is either `unlocked' (meaning that VExtract\n\
                  should extract as many vectors per class as possible, up to\n\
                  a maximum of NUM per SOURCE), or `locked' (meaning that\n\
                  VExtract should extract exactly the same number of vectors\n\
                  per class from each SOURCE, even if that number is fewer\n\
                  than NUM).\n\
 -o outfile       Write toks to outfile in svmlight format\n\
 -t /t1,t2,t3,t4/ Concatenate frames t+t1,t+t2,t+t3,t+t4        \n\
 -t /b:s:e/       Concatenate frames t+b,t+b+s,t+b+2*s,...,t+e\n\
 -A               Print command line arguments\n\
 -I MLF           Read transcriptions from master label file MLF\n\
 -R               Print RCS version information\n\
 -S f             Use script file f\n\
 -T n             Set trace level to n (meaningful: 1,3,7,15,31)\n\
 -V               Print the version string\n\
";

/*
 * ReportUsage: print msg followed by the USAGE text to stdout, then call Exit(0).
 *
 * msg is written verbatim.  It must NOT be interpreted as a printf format,
 * because callers build it from user-supplied switch text (a stray '%' on the
 * command line would otherwise be read as a conversion specifier).
 */
void ReportUsage(char *msg)
{
  if (msg != NULL) fputs(msg, stdout);
  fputs(USAGE, stdout);
  fputs("\n", stdout);
  Exit(0);
}

/* SetConfParms: set conf parms relevant to this tool */
/*
 * SetConfParms: load the configuration parameters registered under the
 * "VEXTRACT" name into the file-scope configs[] table (count kept in nParm),
 * and, if an integer TRACE parameter is present, copy it into the file-scope
 * trace level.
 */
void SetConfParms(void)
{
   int i;

   nParm = GetConfig("VEXTRACT", TRUE, configs, MAXGLOBS);
   if (nParm>0){
     if (GetConfInt(configs,nParm,"TRACE",&i)) trace = i;
   }
}

    
/******************************************************************/
/* ----------- main -------------- */

/*
 * main: VExtract entry point.
 *
 * Stages, in order:
 *   1. Initialise the library modules and heaps, then parse the switches
 *      (class definitions, normalisation/statistics files, token limits,
 *      relative frame offsets, master label file, output file).
 *   2. First pass over the source files: read each header, find the
 *      transcript whose name matches the file's base name, and record every
 *      candidate frame number per class.
 *   3. Decide how many tokens to keep per class and down-sample the candidate
 *      lists in place.
 *   4. Second pass: load each file and copy the selected frames (plus the
 *      requested relative frames) into the Output matrix.
 *   5. Optionally compute statistics, normalise, compute histograms, and write
 *      the statistics file and the token file.
 *
 * Statistics layout: block 0 (rows 1..statsblocksize) holds the global
 * statistics and block i+1 holds the statistics of class i, as described in
 * the USAGE text ("Following the global stats, stats are given separately for
 * each class").
 */
int main(int argc, char *argv[])
{
  char *s, *arg;                     /* command line option and arguments */
  char *filename;                    /* buffer for a filename */
  char fullpath[MAX_LINE];           /* static buffer for the full pathname */
  char *classlabels[MAX_CLASSES];    /* Labels for the classes */
  regex_t classregex[MAX_CLASSES];   /* Regular expressions that define classes */
  char *pat, *class;                 /* labels, and the patterns that define them */
  char buf[MAXSTRLEN];               /* string buffer */
  int nClasses=0;                    /* number of classes */
  FILE *outputfid=NULL;              /* Output file; NULL when neither -a nor -o was given */
  FILE *inputmlf=NULL;               /* File pointer for input MLF */
  int mlf_given=0;                   /* Flag: has at least one MLF been read with -I? */
  int ***TokenTimes;                 /* Frame numbers of input tokens for each file */
  int *TranscriptIndex;              /* Transcript index for each file in the script */
  int **nTokens;                     /* Number of tokens, per class, per input file */
  int *nTotal;                       /* Total number of tokens found per class*/
  int maxTotal=-1;                   /* Max number of tokens allowed per class */
  int ifr,iTotal;                    /* Relative frame counter, and output row counter */
  int nSelected=0;                   /* Number of tokens actually selected, over all classes */
  int nBlocks;                       /* Number of statistics blocks: one global + one per class */
  int maxFrames;                     /* Capacity of each TokenTimes[IFile][i] array */
  IntVec relfr=NULL;                 /* Frame indices to output, relative to row number of current output vector */
  int i,k,d,n,m,p,t;                 /* Counters */
  int start_frame, end_frame;        /* Start frame and end frame, in frame skip units */
  int *num_needed;                   /* Number of tokens of each class that are needed */
  char *line;                        /* Label field of the current MLF line */
  int nMaxInput=-1;                  /* Max number of tokens per input file */
  int nTranscripts=0,iTranscript;    /* Num input filenms whose transcripts have been read*/
  char **TranscriptFilenames;        /* Storage for transcription filenames */
  int *nLabels, iLabel;              /* Storage for number of labels in each MLF */
  char ***TranscriptLabels;          /* Storage for label strings in each transcript */
  int **TranscriptStartTimes;        /* Start times */
  int **TranscriptEndTimes;          /* End times */
  char *statsoutfile=NULL;           /* Output file for summary statistics */
  FILE *fid=NULL;                    /* Miscellaneous i/o file FID */
  int statsblocksize=3;              /* Statistics block size = 3 if there is no histogram */
  SVMDef Stats;                      /* Matrix for computed stats */
  SVMDef Norm;                       /* Matrix for normalizing stats */
  SVMDef Output;                     /* Matrix for output tokens */
  char locking_type='c';             /* Locking type: 'f'ile, 'c'orpus, or 'n'one */
  char **FileList;                   /* List of files, read from the command line */
  int NumFiles,IFile;                /* number of files, and file counter */
  DVector HThresh=NULL;              /* Thresholds for histogram computation */
  double *sampling_period;           /* Sample one frame per sampling_period[iClass] */
  double *sampled_frame;             /* Most recent frame is sampled_frame[iClass]; modulo file length across files */
  HParm HTKDATA;                     /* Storage for data read from HTK HParm file */

  if(InitShell(argc,argv,vextract_version,"")<SUCCESS) HError(1000,"VExtract: InitShell failed");
  InitMem();   InitLabel();
  InitMath();  InitSigP();
  InitWave();  InitAudio();
  InitVQ();    InitModel();
  if(InitParm()<SUCCESS) HError(1000,"VExtract: InitParm failed");

  if (NumArgs() == 0) ReportUsage("");
  SetConfParms();

  /* Create memory */
  CreateHeap(&mStack, "MatStack",  MSTAK, 1, 0.0, 10000000, LONG_MAX);
  CreateHeap(&xfStack,"StringStack",MSTAK,1,0.0,  10000000, LONG_MAX);

  /* Per-class bookkeeping */
  num_needed = (int *)calloc(MAX_CLASSES+1, sizeof(int));
  sampling_period = (double *)calloc(MAX_CLASSES+1, sizeof(double));
  sampled_frame = (double *)calloc(MAX_CLASSES+1, sizeof(double));

  /* Information about individual files */
  TokenTimes = (int ***) calloc(MAX_FILES, sizeof(int **));
  TranscriptIndex = (int *) calloc(MAX_FILES, sizeof(int));
  nTokens = (int **)calloc(MAX_FILES, sizeof(int *));

  /* Initialize things to deal with MLFs */
  TranscriptFilenames = (char **)calloc(MAX_FILES,sizeof(char *));
  nLabels = (int *)calloc(MAX_FILES, sizeof(int));
  TranscriptLabels = (char ***)calloc(MAX_FILES,sizeof(char **));
  TranscriptStartTimes = (int **)calloc(MAX_FILES,sizeof(int *));
  TranscriptEndTimes = (int **)calloc(MAX_FILES,sizeof(int *));

  if(num_needed==NULL || sampling_period==NULL || sampled_frame==NULL ||
     TokenTimes==NULL || TranscriptIndex==NULL || nTokens==NULL ||
     TranscriptFilenames==NULL || nLabels==NULL || TranscriptLabels==NULL ||
     TranscriptStartTimes==NULL || TranscriptEndTimes==NULL) {
    fprintf(stderr,"VExtract: out of memory\n");
    Exit(1);
  }

  /* Initialize the normalization and statistics transform structures */
  ZeroSVMDef(&Stats);
  ZeroSVMDef(&Norm);
  ZeroSVMDef(&Output);

  /* ----------- Process Options -------------- */
  while (NextArg() == SWITCHARG) {
    s = GetSwtArg();
    if (strlen(s)!=1) HError(1,"VExtract: Bad switch %s; must be single letter",s);
    switch(s[0]){
    case 'a':
      if (NextArg() != STRINGARG)
        ReportUsage("VExtract: Output filename expected after -a\n");
      if((outputfid=fopen((filename=GetStrArg()),"a"))==NULL) {
        perror("VExtract");
        HError(1,"VExtract: Unable to append to output file %s\n",filename);
      }
      if(trace & T_OPTS) fprintf(stderr,"VExtract: Will append output to %s\n",filename);
      break;
    case 'c':       /* Read in a Class Definition */
      /* Get the class label, and remove the / wrapper */
      if (NextArg() != STRINGARG) ReportUsage("VExtract: String expected after -c\n");
      class=remove_wrapper(CopyString(&xfStack, GetStrArg()));

      /* If max number of classes has been exceeded, die with an error */
      if(nClasses >= MAX_CLASSES) {
        snprintf(buf,MAXSTRLEN,"VExtract: max num classes exceeded: (%d, class %s)\n",
                 MAX_CLASSES, class);
        ReportUsage(buf);
      }

      /* Create a new class */
      classlabels[nClasses] = (char *)calloc(strlen(class)+1, sizeof(char));
      if(classlabels[nClasses]==NULL) {
        fprintf(stderr,"VExtract: out of memory\n");
        Exit(1);
      }
      strcpy(classlabels[nClasses], class);

      /* Get the regular expression, compile it, and store it */
      if (NextArg() != STRINGARG) ReportUsage("VExtract: REGEX expected after -c\n");
      pat=remove_wrapper(CopyString(&xfStack, GetStrArg()));
      if((i=regcomp(&(classregex[nClasses]), pat, REG_NOSUB)) != 0) {
        /* If regular expression parser returned an error, print it out */
        regerror(i, &(classregex[nClasses]), buf, MAXSTRLEN);
        fprintf(stderr,
                "VExtract: In parsing of regex %s, parser returned error message %s\n",
                pat,buf);
        Exit(-1);
      }

      /* Increment number of classes; Done */
      nClasses++;
      break;
    case 'f':
      if (NextArg() != STRINGARG)
        ReportUsage("VExtract: Output filename expected after -f\n");
      statsoutfile=CopyString(&xfStack,GetStrArg());
      if(trace & T_OPTS)
        fprintf(stderr,"VExtract: Will write statistics to %s\n",statsoutfile);
      break;
    case 'g':
      if (NextArg() != STRINGARG)
        ReportUsage("VExtract: Input filename expected after -g\n");
      filename=GetStrArg();
      ReadSVMFile(filename, &xfStack, &Norm, trace);
      if(NumDRows(Norm.SV) < 2)
        HError(1,"VExtract: -g option must be svmtoks file with at least two vectors\n");
      if(Norm.alpha[1] != 0 || Norm.alpha[2] != 0) {
        /* alpha holds doubles, so it must be printed with %g rather than %d */
        fprintf(stderr,
                "VExtract: labels on normalization offset and scale (%g and %g, rows 1 and 2 of %s) will be ignored\n",
                (double)Norm.alpha[1],(double)Norm.alpha[2],filename);
      }
      if(trace & T_VERBOSE) {
        fprintf(stderr,
                "Read normalization stats from %s, will normalize by (x-mu)/sd:\n  mu ",
                filename);
        for(n=1; n<=3 && n<=NumDCols(Norm.SV); n++)
          fprintf(stderr,"%d:%g ",n,Norm.SV[1][n]);
        fprintf(stderr,"...\n  sd ");
        for(n=1; n<=3 && n<=NumDCols(Norm.SV); n++)
          fprintf(stderr,"%d:%g ",n,Norm.SV[2][n]);
        fprintf(stderr,"...\n");
      }
      break;
    case 'h':
      if (NextArg() != STRINGARG)
        ReportUsage("VExtract: Vector definition string expected after -h\n");
      HThresh = strtoDVector(&xfStack, GetStrArg());
      /* Each stats block gets mean, SD, count, one bin per threshold, and one overflow bin */
      statsblocksize = 4 + DVectorSize(HThresh);
      if (trace & T_OPTS) {
        fprintf(stderr,"Histogram will be computed with %d thresholds: [%g",
                DVectorSize(HThresh),HThresh[1]);
        for(i=2;i<=DVectorSize(HThresh);i++)
          fprintf(stderr,",%g",HThresh[i]);
        fprintf(stderr,"]\n");
      }
      break;
    case 'm':  /* Get information about the number, source, and locking type of toks */
      /* First, find out how many tokens are desired */
      if (NextArg() != INTARG)
        ReportUsage("VExtract: -m should be followed by an integer\n");
      maxTotal=GetChkedInt(0,10000,s);
      /* Second, find out if this number is desired once per file or per corpus */
      if (NextArg() != STRINGARG)
        ReportUsage("VExtract: -m should be followed by NUM SOURCE LOCKTYPE\n");
      pat=GetStrArg();
      /* If source is `file' or `FILE', make this the number per file, not per corpus */
      if(*pat=='f' || *pat=='F') {
        nMaxInput=maxTotal;
        maxTotal=-1;
      }
      /* Otherwise: if SOURCE argument is unrecognized, call an error message */
      else if(*pat != 'c' && *pat != 'C') {
        snprintf(buf,MAXSTRLEN,
                 "SOURCE argument of -m must be `file' or `corpus,' not %s\n",pat);
        ReportUsage(buf);
      }
      /* Third, find out whether or not to lock the corpus */
      if (NextArg() != STRINGARG)
        ReportUsage("VExtract: -m should be followed by NUM SOURCE LOCKTYPE\n");
      pat=GetStrArg();
      /* Determine locking_type (accept either case, like SOURCE above) */
      if(*pat=='l' || *pat=='L') {
        if(nMaxInput >= 0) locking_type='f';
        else locking_type='c';
      }
      else if(*pat=='u' || *pat=='U') locking_type='n';
      /* If locking_type not recognized, print an error */
      else {
        snprintf(buf,MAXSTRLEN,
                 "LOCKTYPE argument of -m must be `locked' or `unlocked,' not %s\n",pat);
        ReportUsage(buf);
      }
      /* Trace information */
      if (trace & T_OPTS) {
        if(nMaxInput >= 0)
          fprintf(stderr,"Will output at most %d tokens/class/file,",nMaxInput);
        else
          fprintf(stderr,"Will output at most %d tokens/class total,",maxTotal);
        if(locking_type=='n')
          fprintf(stderr," different classes may have different token counts\n");
        else
          fprintf(stderr," different classes will have the same token count\n");
      }
      break;
    case 'o':
      if (NextArg() != STRINGARG)
        ReportUsage("VExtract: Output filename expected after -o\n");
      if((outputfid=fopen((filename=GetStrArg()),"w"))==NULL) {
        perror("VExtract");
        HError(1,"VExtract: Unable to write to output file %s\n",filename);
      }
      if(trace & T_OPTS) fprintf(stderr,"VExtract: Will write output to %s\n",filename);
      break;
    case 't':  /* Specify relative frame numbers that will be concatenated */
      if (NextArg() != STRINGARG)
        ReportUsage("VExtract: -t should be followed by a vector of offsets\n");
      relfr = strtoIntVec(&xfStack, GetStrArg());
      if (trace & T_OPTS) {
        fprintf(stderr,"Output lines will contain %d frames: [%d",IntVecSize(relfr),relfr[1]);
        for(ifr=2;ifr<=IntVecSize(relfr);ifr++) fprintf(stderr,",%d",relfr[ifr]);
        fprintf(stderr,"]\n");
      }
      break;
    case 'A':
      /* Repeat the arguments to stderr; argv[argc] is NULL and must not be printed */
      for(m=0; m<argc; m++) fprintf(stderr,"%s ",argv[m]);
      break;

    case 'I':
      if (NextArg() != STRINGARG)
        ReportUsage("VExtract: Input MLF filename expected after -I\n");
      if((inputmlf=fopen((arg=GetStrArg()),"r"))==NULL) {
        perror("VExtract");
        HError(1,"VExtract: Unable to read from input MLF %s\n",arg);
      }

      /* Read the input file transcriptions */
      nTranscripts = read_MLF(inputmlf, &xfStack, nTranscripts,TranscriptFilenames,nLabels,TranscriptLabels,
                              TranscriptStartTimes,TranscriptEndTimes);
      if(trace & T_OPTS) fprintf(stderr,"Read %d file descriptions from %s\n",nTranscripts,arg);
      fclose(inputmlf);
      /* A closed FILE pointer must not be inspected again; remember success in a flag */
      inputmlf=NULL;
      mlf_given=1;
      break;
    case 'R':
      fprintf(stderr,"%s\n",vextract_version);
      PrintRCSIdentifier(stderr);
      Exit(0);
      break;
    case 'T':
      trace = GetChkedInt(0,255,s);
      break;
    case 'V':
      fprintf(stderr,"%s",vextract_version);
      Exit(0);
      break;
    default:
      snprintf(buf,MAXSTRLEN,"VExtract: Unknown switch %s",s);
      ReportUsage(buf);
    }
  }

  /* Verify the defined classes */
  if (trace & T_TOP) for(i=0; i<nClasses; i++)
    fprintf(stderr,"Class %d is %s\n",i,classlabels[i]);
  if (nClasses == 0)
    ReportUsage("VExtract: At least one class must be defined with -c\n");

  /* If relfr wasn't specified, specify a one-element relfr with content=0 */
  if(relfr==NULL) {
    relfr=CreateIntVec(&xfStack, 1);
    relfr[1]=0;
  }

  /* -----------------------------------------------------------*/
  /* ----------- Load up source file descriptions --------------*/
  if (NumArgs() < 1) ReportUsage("VExtract: Source file name expected");
  FileList = (char **)malloc(sizeof(char *)*MAX_FILES);
  if (FileList == NULL) {
    fprintf(stderr,"VExtract: out of memory\n");
    Exit(1);
  }
  NumFiles=0;
  while(NumArgs() > 0) {
    if (NextArg()!=STRINGARG) HError(1,"VExtract: expected only filenames after options are done");
    /* All per-file tables are sized MAX_FILES */
    if (NumFiles >= MAX_FILES) {
      fprintf(stderr,"VExtract: too many source files (max %d)\n",MAX_FILES);
      Exit(1);
    }
    FileList[NumFiles] = CopyString(&xfStack,GetStrArg());
    NumFiles++;
  }

  /**************************************************************************/
  /* ----------- First time through the list of files:  */
  /* -------------Figure out how many tokens are in each class, */
  /* -------------Check to make sure that an MLF was read */
  if (!mlf_given) ReportUsage("VExtract: Input MLF must be specified");
  /* Allocate space to count the number of tokens per class */
  nTotal = (int *)calloc(nClasses,sizeof(int));
  if (nTotal == NULL) {
    fprintf(stderr,"VExtract: out of memory\n");
    Exit(1);
  }

  for(IFile=0; IFile < NumFiles; IFile++) {
    if((fid=fopen(FileList[IFile],"r"))==NULL) {
      fprintf(stderr,"VExtract: unable to read from %s\n",FileList[IFile]);
      perror("VExtract");
      continue;
    }

    /* Read header to find sampling period and number of rows, then close file */
    ReadHParmHeader(fid, &HTKDATA);
    fclose(fid);

    /* Expand the filename to get rid of . and ..; fall back to the name as given */
    /* NOTE(review): realpath() may write up to PATH_MAX bytes -- confirm MAX_LINE >= PATH_MAX */
    if(realpath(FileList[IFile],fullpath)==NULL)
      snprintf(fullpath,sizeof fullpath,"%s",FileList[IFile]);
    /* Skip past the directory structure to the filename */
    filename=basename(fullpath);
    /* Snip off the extension, if any */
    if((s=strchr(filename,'.'))!=NULL) *s='\0';
    /* Look in TranscriptFilenames for the filename */
    for(iTranscript=0; iTranscript<nTranscripts &&
          strcmp(TranscriptFilenames[iTranscript],filename)!=0; iTranscript++);

    /* Continue only if we were able to find a transcript for this file */
    if(iTranscript>=nTranscripts) {
      fprintf(stderr,"VExtract: Unable to find root file %s in any MLF; skipping\n",
              filename);
      continue;
    }

    TranscriptIndex[IFile]=iTranscript;
    if(trace & T_VERBOSE)
      fprintf(stderr,
              "%s (%d'th input) and %s (%d'th transcript, %d lines) matched %s\n",
              FileList[IFile],IFile,TranscriptFilenames[iTranscript],
              iTranscript,nLabels[iTranscript],filename);

    /* Allocate space for the token times: up to the total number of */
    /* frames in the file, according to HTKDATA */
    maxFrames=(int)HTKDATA.nrows;
    if(maxFrames < 0) maxFrames=0;
    TokenTimes[IFile]=(int **)calloc(nClasses, sizeof(int *));
    /* Number of tokens in each class in this file */
    nTokens[IFile] = (int *)calloc(nClasses,sizeof(int));
    if(TokenTimes[IFile]==NULL || nTokens[IFile]==NULL) {
      fprintf(stderr,"VExtract: out of memory\n");
      Exit(1);
    }
    for(i=0; i<nClasses; i++) {
      TokenTimes[IFile][i]=(int *)calloc(maxFrames,sizeof(int));
      /* No storage means no candidates can be recorded for this class */
      if(TokenTimes[IFile][i]==NULL) maxFrames=0;
    }

    /* Check each input line, to determine of which classes it may be a member */
    for(iLabel=0; iLabel<nLabels[iTranscript]; iLabel++) {

      /* Find the starting frame and ending frame of this line */
      start_frame=(int)floor((double)TranscriptStartTimes[iTranscript][iLabel] /
                             HTKDATA.sampPeriod);
      end_frame=(int)floor((double)TranscriptEndTimes[iTranscript][iLabel] /
                           HTKDATA.sampPeriod);
      line=TranscriptLabels[iTranscript][iLabel];

      /* For each class... */
      for(i=0; i<nClasses; i++) {
        /* regexec returns 0 on a match; any other value (no match or error) is skipped */
        if(regexec(&(classregex[i]), line, 0, NULL, 0) != 0) continue;

        /* Add all corresponding times to the TokenTimes array, never writing */
        /* past its maxFrames entries (labels may overlap or overrun the file) */
        for(t=start_frame; t<=end_frame && nTokens[IFile][i]<maxFrames; t++) {
          TokenTimes[IFile][i][nTokens[IFile][i]]=t;
          /* .. and increment nTokens[IFile][i] and nTotal[i] */
          nTokens[IFile][i]++; nTotal[i]++;
        }
        if(trace & T_VERBOSE)
          fprintf(stderr,
                  "Found %d'th example of %s, %d'th in %s:(%d,%d)/(%d,%d):\n %s",
                  nTotal[i],classlabels[i],nTokens[IFile][i],FileList[IFile],
                  start_frame,end_frame,TranscriptStartTimes[iTranscript][iLabel],
                  TranscriptEndTimes[iTranscript][iLabel],line);
      }
    }
  }

  /* Warn if any class has zero token-count */
  for(i=0; i<nClasses; i++)
    if(nTotal[i]==0)
      fprintf(stderr,"VExtract: zero tokens found for class %s\n",classlabels[i]);

  /**************************************************************************/
  /* Determine how many tokens to use, from each file, for each class */

  /* If maxTotal < 0, set maxTotal to the maximum of nTotal */
  if(maxTotal < 0)  for(i=0; i<nClasses; i++) if(maxTotal < nTotal[i]) maxTotal=nTotal[i];
  /* If locking_type == corpus, revise maxTotal down to minimum of nTotal */
  if(locking_type == 'c') for(i=0; i<nClasses; i++)
    if(maxTotal > nTotal[i]) maxTotal=nTotal[i];
  if(maxTotal==0) {
    fprintf(stderr,
            "VExtract: number of output tokens for all classes has been set to zero\n");
    Exit(0);
  }

  /* Set the sampling period for this class: either nTotal[i]/maxTotal, or 1.0 */
  for (i=0; i<nClasses; i++) {
    sampling_period[i] = (maxTotal>0 && maxTotal<nTotal[i]) ?
      (double)nTotal[i] / maxTotal : 1;
    /* Down-sample at the half-frames */
    sampled_frame[i] = sampling_period[i]/2;
  }

  /* Get inputs from each file */
  /* Reset nTotal, so we can use it to count the number actually selected */
  for(i=0; i<nClasses; i++) nTotal[i]=0;
  for(IFile=0; IFile<NumFiles; IFile++) {
    if(nTokens[IFile]==NULL) continue;

    /* Initialize num_needed for this file to min(nTokens, nMaxInput) */
    for(i=0; i<nClasses; i++)
      num_needed[i] = (nMaxInput>0 && nMaxInput<nTokens[IFile][i]) ?
        nMaxInput : nTokens[IFile][i];
    /* If file locking is on, clip num_needed to the minimum of all classes */
    if(locking_type=='f') {
      m=INT_MAX;
      for(i=0; i<nClasses; i++)
        if(num_needed[i]<m) m=num_needed[i];
      for(i=0; i<nClasses; i++) num_needed[i]=m;
    }

    /* Check every class, from every file, to see whether it needs to be downsampled */
    for(i=0; i<nClasses; i++) {
      if(trace & T_VERBOSE) {
        fprintf(stderr,"--- Reading class %d from file %d (%s)\n",
                i,IFile,FileList[IFile]);
        fprintf(stderr,"    starting with frame %g (%d), sampling period %g\n",
                sampled_frame[i],(int)sampled_frame[i], sampling_period[i]);
        fprintf(stderr,"    (nTokens[i],nTotal[i]): (%d,%d)\n",
                nTokens[IFile][i],nTotal[i]);
      }

      /* Nothing wanted from this file for this class: select nothing, and */
      /* avoid dividing by zero when computing the sampling period below  */
      if(num_needed[i] <= 0) {
        nTokens[IFile][i]=0;
        continue;
      }

      /* If num_needed is less than the number in the file, */
      /* change the sampling_period to match */
      /* NOTE(review): the changed period carries over to later files -- confirm intended */
      if(num_needed[i] < nTokens[IFile][i])
        sampling_period[i] = (double)nTokens[IFile][i] / num_needed[i];
      /* Grab token times once per sampling period, */
      /* and move them down to start of the vector */
      for(m=0; sampled_frame[i] < nTokens[IFile][i] &&
            m<num_needed[i] && nTotal[i]<maxTotal; m++ ) {
        /* Bounded index of the candidate to keep */
        n=BoundInteger((int)sampled_frame[i],0,nTokens[IFile][i]-1);
        /* Move the sampled candidate down to position m */
        TokenTimes[IFile][i][m] = TokenTimes[IFile][i][n];
        /* Increment the total number of tokens */
        nTotal[i]++;
        /* Move to the next frame to be sampled */
        sampled_frame[i] += sampling_period[i];
      }
      /* Move sampled_frame down modulo nTokens,... */
      sampled_frame[i] -= nTokens[IFile][i];
      /* ...restarting at the half-frame if the loop stopped before the end of the file */
      if(sampled_frame[i] < 0) sampled_frame[i] = sampling_period[i]/2;
      /* ... then change nTokens to the number of frames actually selected */
      nTokens[IFile][i]=m;
      if(trace & T_VERBOSE) {
        fprintf(stderr,"    (nTokens[i],nTotal[i])  (%d,%d)\n",
                nTokens[IFile][i],nTotal[i]);
        fprintf(stderr,"    File %d of %d, class %d out of %d done\n",
                IFile,NumFiles,i,nClasses);
      }
    }
  }

  /****************************************************************************/
  /*   Allocate an output SVMDef to hold all of the output, normalized tokens */
  /*   Assume that the most recently read HTKDATA has the right vector size   */
  /*   The row count is the number of tokens actually selected, so that no    */
  /*   unfilled rows reach the statistics or the output file.                 */
  for(i=0; i<nClasses; i++) nSelected += nTotal[i];
  if(nSelected==0) {
    fprintf(stderr,"VExtract: no tokens were selected from any file\n");
    Exit(0);
  }
  ZeroSVMDef(&Output);
  Output.SV = CreateDMatrix(&xfStack,nSelected,IntVecSize(relfr) * HTKDATA.ncols);
  Output.alpha = CreateDVector(&xfStack, nSelected);
  Output.w = CreateDVector(&xfStack, nSelected);
  /* Zero everything so that rows of a file that later fails to load are well defined */
  ZeroDMatrix(Output.SV);
  ZeroDVector(Output.alpha);
  ZeroDVector(Output.w);

  /* Go through all of the files, loading in all of the selected tokens */
  iTotal=0;
  for(IFile=0; IFile < NumFiles; IFile++) {
    if(nTokens[IFile] == NULL) continue;

    ResetHeap(&mStack);
    ReadHParm(&mStack, FileList[IFile], &HTKDATA, trace);
    if(HTKDATA.X == NULL) {
      fprintf(stderr,"VExtract: no data loaded from %s; skipping\n",FileList[IFile]);
      continue;
    }
    if(trace & T_VERBOSE) fprintf(stderr,"Loaded HTKData of size %dx%d from %s, %dth token\n",
                                  NumDRows(HTKDATA.X),NumDCols(HTKDATA.X),FileList[IFile],iTotal);

    /* Every file must have the vector size that Output was allocated for */
    if(IntVecSize(relfr)*NumDCols(HTKDATA.X) != NumDCols(Output.SV)) {
      fprintf(stderr,"VExtract: %s has %d columns, which does not match the output size; skipping\n",
              FileList[IFile],NumDCols(HTKDATA.X));
      continue;
    }

    /* Find the basename, so that T_TOP printout will take less space */
    filename=CopyString(&mStack,FileList[IFile]);
    s=basename(filename);

    /* Extract the tokens of class #i, token #m from file #IFile  */
    for(i=0; i<nClasses; i++)
      for(m=0; m<nTokens[IFile][i] && iTotal<nSelected; m++) {
        /* Increment iTotal, */
        /* write the label of this class into Output.alpha, */
        /* and its index into Output.w */
        Output.alpha[++iTotal] = strtod(classlabels[i],NULL);
        Output.w[iTotal] = i;

        if(trace & T_TOP)
          fprintf(stderr,"Token %d: class %g, file %s, frame [",
                  iTotal,Output.alpha[iTotal],s);
        /* Get frame #ifr relative to the current frame */
        for(ifr=1; ifr<=IntVecSize(relfr); ifr++) {
          n = BoundInteger(TokenTimes[IFile][i][m] + relfr[ifr],0,NumDRows(HTKDATA.X)-1);
          if(trace & T_TOP) fprintf(stderr," %d",n);
          for (d=1; d<=NumDCols(HTKDATA.X); d++)
            Output.SV[iTotal][(ifr-1)*NumDCols(HTKDATA.X)+d]=HTKDATA.X[n+1][d];
        }
        if(trace & T_TOP)
          fprintf(stderr," ] (%d us)\n",
                  (int)(TokenTimes[IFile][i][m]*(HTKDATA.sampPeriod)/10));
      }
  }
  if(iTotal < nSelected)
    fprintf(stderr,"VExtract: only %d of %d selected tokens could be loaded; remaining rows are zero\n",
            iTotal,nSelected);

  /**********************************************************************************/
  /* ----------- Compute statistics: mean, variance, and token count for each class */
  nBlocks = nClasses + 1;   /* block 0 is global; block i+1 belongs to class i */
  if(statsoutfile != NULL) {
    if(trace & T_VERBOSE) fprintf(stderr,"Computing statistics\n");
    ZeroSVMDef(&Stats);
    Stats.SV=CreateDMatrix(&xfStack,statsblocksize*nBlocks,NumDCols(Output.SV));
    ZeroDMatrix(Stats.SV);
    Stats.alpha=CreateDVector(&xfStack,statsblocksize*nBlocks);
    ZeroDVector(Stats.alpha);

    /* Label the mean, SD, and num-tokens lines with appropriate class ID; */
    /* the global block keeps label 0 */
    for(i=0; i<nClasses; i++) for(n=1; n<=3; n++)
      Stats.alpha[(i+1)*statsblocksize+n] = strtod(classlabels[i],NULL);

    /* Accumulate sum, sum-squared, and N, both global and class-dependent */
    for(m=1; m<=NumDRows(Output.SV); m++) {
      p=((int)Output.w[m]+1)*statsblocksize;
      for(n=1; n<=NumDCols(Output.SV); n++) {
        Stats.SV[1][n] += Output.SV[m][n];
        Stats.SV[2][n] += Output.SV[m][n] * Output.SV[m][n];
        (Stats.SV[3][n])++;
        Stats.SV[p+1][n] += Output.SV[m][n];
        Stats.SV[p+2][n] += Output.SV[m][n] * Output.SV[m][n];
        (Stats.SV[p+3][n])++;
      }
    }
    /* Convert the accumulators into mean and standard deviation, block by block */
    for(i=0; i<nBlocks; i++) for(d=1; d<=NumDCols(Stats.SV); d++) {
      p=i*statsblocksize;
      /* If there are at least two tokens, find the mean and SD, o.w. SD=0 and mean=sum */
      if(Stats.SV[p+3][d] >= 2) {
        /* The H-J fast variance formula: Var = (SSQ-Sum*Sum/N) / (N-1) */
        Stats.SV[p+2][d] = (Stats.SV[p+2][d]-Stats.SV[p+1][d]*Stats.SV[p+1][d]/Stats.SV[p+3][d])/(Stats.SV[p+3][d]-1);
        Stats.SV[p+1][d] /= Stats.SV[p+3][d];                                 /* Mean = Sum/N */
        Stats.SV[p+2][d] = (Stats.SV[p+2][d]>0) ? sqrt(Stats.SV[p+2][d]) : 0; /* SD=sqrt(Var) if Var>0, 0 otherwise */
      }
      else Stats.SV[p+2][d] = 0;                                              /* If N<2, SD=0 */
    }
    /* If normalization not previously defined, copy Stats to Norm */
    if(Norm.SV == NULL) {
      if(trace & T_VERBOSE) fprintf(stderr,"No normalization file read, so we'll use computed stats instead\n");
      CopySVMDef(&Norm, &Stats, &xfStack);
    }
    else if(trace & T_VERBOSE) fprintf(stderr,"Normalization will use stats read with -g option, not computed stats\n");
  }

  /**********************************************************************************/
  /* If normalization is available, normalize the output tokens --------------------*/
  if(Norm.SV != NULL) {
    if(trace & T_VERBOSE) fprintf(stderr,"Normalizing outputs: norm size (%d,%d), output size (%d,%d)\n",
                                  NumDRows(Norm.SV),NumDCols(Norm.SV),NumDRows(Output.SV),NumDCols(Output.SV));
    if(NumDCols(Norm.SV) != NumDCols(Output.SV))
      fprintf(stderr,"VExtract: Can't normalize; normfile has %d columns, output has %d\n",NumDCols(Norm.SV),NumDCols(Output.SV));
    else if(NumDRows(Norm.SV)<2)
      fprintf(stderr,"VExtract: Can't normalize; normfile needs at least 2 rows\n");
    else for(n=1; n<=NumDCols(Output.SV); n++) {
      /* Row 1 of Norm is the offset (mu), row 2 the scale (sd); a zero sd is not applied */
      for(m=1; m<=NumDRows(Output.SV); m++) Output.SV[m][n] -= Norm.SV[1][n];
      if(Norm.SV[2][n] > 0)  for(m=1; m<=NumDRows(Output.SV); m++) Output.SV[m][n] /= Norm.SV[2][n];
    }
  }

  /**********************************************************************************/
  /* Compute the histogram, if one has been requested ------------------------------*/
  if(statsoutfile != NULL && HThresh != NULL && statsblocksize > 3) {
    if(trace & T_VERBOSE)
      fprintf(stderr,"Computing a histogram of size %d\n",statsblocksize-3);
    /* Label bins 4..3+K of every block with their thresholds; the last row of */
    /* each block (row statsblocksize) is the overflow bin and keeps label 0   */
    for(i=0; i<nBlocks; i++) for(k=1; k<=DVectorSize(HThresh); k++)
      Stats.alpha[i*statsblocksize+3+k]=HThresh[k];
    for(m=1; m<=NumDRows(Output.SV); m++) for(n=1; n<=NumDCols(Output.SV); n++) {
      /* Find the first threshold not exceeded; k==statsblocksize means overflow */
      for(k=4; k<statsblocksize && Output.SV[m][n] > Stats.alpha[k]; k++);
      /* Increment element in the global histogram, and in the class histogram */
      (Stats.SV[k][n])++;
      (Stats.SV[((int)Output.w[m]+1)*statsblocksize+k][n])++;
    }
  }

  /**********************************************************************************/
  /* Finally, print the statistics file, and the output, if so requested -----------*/
  if(statsoutfile != NULL) {
    if(trace & T_VERBOSE) fprintf(stderr,"Saving computed statistics to %s\n",statsoutfile);
    if((fid=fopen(statsoutfile,"w"))==NULL) {
      fprintf(stderr,"VExtract: Unable to write to %s\n",statsoutfile);
      perror("VExtract");
    }
    else {
      SaveSVMVectors(fid,&Stats);
      fclose(fid);
    }
  }
  if(outputfid != NULL) {
    if(trace & T_VERBOSE) fprintf(stderr,"Saving output tokens\n");
    SaveSVMVectors(outputfid,&Output);
    fclose(outputfid);
  }
  Exit(0);
  return 0;
}