/* ------------------------------------------------------------------------- */ /* VExtract.c: Extract Vectors whose transcription matches patterns Copyright 2005, Trustees of the University of Illinois Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. Revision history: August 2004, Mark Hasegawa-Johnson: First revision created Likely future revisions: PVTK should be cleaned up, made independent of HTK, merged with SpeechLib, and rewritten from the bottom up to be compatible with LAPACK and STL. /* ------------------------------------------------------------------------- */ char *vextract_version = "$Id$"; #include "PVTK.h" #include #define MAX_PATTERNS 200 /* ---------------------- Global Variables ----------------------- */ static int trace = 0; /* Trace level */ static ConfParam *configs[MAXGLOBS]; static int nParm = 0; /* total num params */ static BufferInfo info; static MemHeap mStack; /* matrix stack */ static MemHeap xfStack; /* Transform stack -- not reset in every loop*/ /* ---------------- Usage Description ------------------------- */ static char *USAGE="\n\ USAGE: VExtract [opts] src1 ...\n\ \n\ Suggested usage:\n\ \n\ (1) Create listings of the training and testing files:\n\ \n\ ls traindata/*.htk > train.scp;\n\ ls testdata/*.htk > test.scp;\n\ \n\ (2) Suppose that MLF lines matching regular expression REG1 correspond to\n\ frames of class +1, while MLF lines matching REG2 correspond to -1.\n\ Assume that file MLF.mlf is an HTK-format master label file. \n\ Suppose that you want at most 5000 tokens of each class, but that\n\ you want to make sure to end up with the same number of tokens in each\n\ class, even if it means getting fewer than 10000 tokens.\n\ Compute the statistics, and extract the training tokens, using \n\ \n\ VExtract -T 1 -c /+1/REG1/ -c /-1/REG2/ -m 5000,corpus,locked\n\ -f train.stats -o train.toks -I MLF.mlf -S train.scp\n\ \n\ (3) Finally, extract the test tokens, but normalize using\n\ normalizing statistics from the training tokens:\n\ \n\ VExtract -T 1 -m 2000,corpus,locked -c /+1/REG1/ -c /-1/REG2/ \n\ -g train.stats -o test.toks -I MLF.mlf -S test.scp\n\ \n\ If a line matches regular expression, then all frames between the start\n\ time and end time specified on that line will be candidates for extraction.\n\ Regular expression syntax is specified in man 7 regex. Note, in \n\ particular, that word boundaries may be specified by the symbols [[:<:]] \n\ and [[:>:]] . The actual number of frames extracted from the corpus is \n\ specified by the -m option. \n\ \n\ Tokens are taken from uniformly spaced positions spanning each file.\n\ If -T 1 is specified, VExtract will print (to stderr) a specification\n\ of the frame number and filename from which each token vector was\n\ extracted.\n\ \n\ The MLF read by VExtract is not as flexible as a usual HTK MLF.\n\ Lines starting with a quotation mark are read as filenames. Lines\n\ starting with a digit are read as segment descriptors (start end\n\ label). Other lines are ignored.\n\ \n\ If -f is specified, a statsfile will be generated, and all output\n\ tokens will be normalized by the rule (x-mu)/sd. If -g is specified,\n\ the same type of normalization is applied, but mu and sd are read\n\ from the first two lines of the specified statsfile, rather than computed.\n\ If -g and -f are both specified, -g takes precedence, but a new statsfile is\n\ also written. statsfile is in svmlight format. First three lines are mean, SD,\n\ and number of tokens in each dimension. If -h is specifed, the next\n\ lines are labeled by the histogram threshold, with values equal to\n\ the histogram count. Histogram bins contain the number of tokens in each\n\ bin AFTER normalization, thus it is usually reasonable to specify thresholds\n\ in the range of roughly -2:0.2:2 (meaning -2 standard deviations, up to +2 sd).\n\ Following the global stats, stats are given separately for each class.\n\ \n\ Option \n\ \n\ -a outfile Append toks to outfile in svmlight format\n\ -c /LABEL/REGEX/\n\ When a line in the transcription matches REGEX,\n\ every frame between the start time and end time of\n\ that line is a candidate for extraction to class LABEL.\n\ -f statsfile Print mean and sd of extracted toks to statsfile\n\ -g statsfile Normalize outputs using stats in statsfile\n\ -h /th1,th2,th3/ Compute a histogram with given thresholds\n\ -h /b:s:e/ Compute a histogram with thresholds b,b+s,b+2s,...,e\n\ -m NUM,SOURCE,LOCKTYPE\n\ Read at most NUM vectors from each SOURCE.\n\ NUM is an integer, SOURCE is either `corpus' or `file.'\n\ LOCKTYPE is either `unlocked' (meaning that VExtract\n\ should extract as many vectors per class as possible, up to\n\ a maximum of NUM per SOURCE), or `locked' (meaning that\n\ VExtract should extract exactly the same number of vectors\n\ per class from each SOURCE, even if that number is fewer\n\ than NUM).\n\ -o outfile Write toks to outfile in svmlight format\n\ -t /t1,t2,t3,t4/ Concatenate frames t+t1,t+t2,t+t3,t+t4 \n\ -t /b:s:e/ Concatenate frames t+b,t+b+s,t+b+2*s,...,t+e\n\ -A Print command line arguments\n\ -I MLF Read transcriptions from master label file MLF\n\ -R Print RCS version information\n\ -S f Use script file f\n\ -T n Set trace level to n (meaningful: 1,3,7,15,31)\n\ "; void ReportUsage(char *msg) { printf(msg); printf(USAGE); printf("\n"); Exit(0); } /* SetConfParms: set conf parms relevant to this tool */ void SetConfParms(void) { int i; Boolean b; char buf[MAXSTRLEN]; nParm = GetConfig("VEXTRACT", TRUE, configs, MAXGLOBS); if (nParm>0){ if (GetConfInt(configs,nParm,"TRACE",&i)) trace = i; } } /******************************************************************/ /* ----------- main -------------- */ int main(int argc, char *argv[]) { extern char *optarg; extern int optind, opterr, optopt; char *s, *arg, *arg2; /* command line option and arguments */ char *filename; /* buffer for a filename */ char fullpath[MAX_LINE]; /* static buffer for the full pathname */ char *classlabels[MAX_CLASSES]; /* Labels for the classes */ regex_t classregex[MAX_CLASSES]; /* Regular expressions that define classes */ char *pat, *class; /* labels, and the patterns that define them */ char buf[MAXSTRLEN]; /* string buffer */ int nClasses=0; /* number of classes */ FILE *outputfid; /* Output file */ FILE *inputmlf=NULL; /* File pointer for input MLF */ int ***TokenTimes; /* Frame numbers of input tokens for each file */ int *TranscriptIndex; /* Transcript index for each file in the script */ int **nTokens; /* Number of tokens, per class, per input file */ int *nTotal; /* Total number of tokens found per class*/ int maxTotal=-1; /* Max number of tokens allowed per class */ int ifr,iTotal; /* Relative frame numbers to be concatenated */ IntVec relfr; /* Frame indices to output, relative to row number of current output vector */ int i,j,k,d,n,m,p,t; /* Counters */ int start_frame, end_frame; /* Start frame and end frame, in frame skip units */ int *numtoprint; /* Number of tokens of each class to print */ int *num_needed; /* Number of tokens of each class that are needed */ char *line; /* Input line read from the MLF, and its label field */ int nMaxInput=-1; /* Max number of tokens per input file */ int nTranscripts=0,iTranscript; /* Num input filenms whose transcripts have been read*/ char **TranscriptFilenames; /* Storage for transcription filenames */ int *nLabels, iLabel; /* Storage for number of labels in each MLF */ char ***TranscriptLabels; /* Storage for label strings in each transcript */ int **TranscriptStartTimes; /* Start times */ int **TranscriptEndTimes; /* End times */ char *statsoutfile=NULL; /* Output file for summary statistics */ FILE *fid=NULL; /* Miscellaneous i/o file FID */ int statsblocksize=3; /* Statistics block size = 3 if there is no histogram */ SVMDef Stats; /* Matrix for computed stats */ SVMDef Norm; /* Matrix for normalizing stats */ SVMDef Output; /* Matrix for output tokens */ int usable_file=1; /* Flag: is current file usable? */ char locking_type='c'; /* Locking type: file, corpus, or none */ char **FileList; /* List of files, read from the command line */ int NumFiles,IFile; /* number of files, and file counter */ DVector HThresh=NULL; /* Thresholds for histogram computation */ double *sampling_period; /* Sample one frame per sampling_period[iClass] */ double *sampled_frame; /* Most recent frame is sampled_frame[iClass]; modulo file length across files */ HParm HTKDATA; /* Storage for data read from HTK HParm file */ if(InitShell(argc,argv,vextract_version,"")= MAX_CLASSES) { snprintf(buf,MAXSTRLEN,"VExtract: max num classes exceeded: (%d, class %s)\n", MAX_CLASSES, class); ReportUsage(buf); } /* Create a new class */ classlabels[nClasses] = (char *)calloc(strlen(class)+1, sizeof(char)); strcpy(classlabels[nClasses], class); /* Get the regular expression, compile it, and store it */ if (NextArg() != STRINGARG) ReportUsage("VExtract: REGEX expected after -c\n"); pat=remove_wrapper(CopyString(&xfStack, GetStrArg())); if(i=regcomp(&(classregex[nClasses]), pat, REG_NOSUB)) { /* If regular expression parser returned an error, print it out */ regerror(i, &(classregex[nClasses]), buf, MAXSTRLEN); fprintf(stderr, "VExtract: In parsing of regex %s, parser returned error message %s\n", pat,buf); Exit(-1); } /* Increment number of classes; Done */ nClasses++; break; case 'f': if (NextArg() != STRINGARG) ReportUsage("VExtract: Output filename expected after -f\n"); statsoutfile=CopyString(&xfStack,GetStrArg()); if(trace & T_OPTS) fprintf(stderr,"VExtract: Will write statistics to %s\n",statsoutfile); break; case 'g': if (NextArg() != STRINGARG) ReportUsage("VExtract: Input filename expected after -g\n"); filename=GetStrArg(); ReadSVMFile(filename, &xfStack, &Norm, trace); if(NumDRows(Norm.SV) < 2) HError(1,"VExtract: -g option must be svmtoks file with at least two vectors\n"); if(Norm.alpha[1] != 0 || Norm.alpha[2] != 0) { fprintf(stderr, "VExtract: labels on normalization offset and scale (%d and %d, rows 1 and 2 of %s) will be ignored\n", Norm.alpha[1],Norm.alpha[2],filename); } if(trace & T_VERBOSE) { fprintf(stderr, "Read normalization stats from %s, will normalize by (x-mu)/sd:\n mu ", filename); for(n=1; n<=3 && n<=NumDCols(Norm.SV); n++) fprintf(stderr,"%d:%g ",n,Norm.SV[1][n]); fprintf(stderr,"...\n sd "); for(n=1; n<=3 && n<=NumDCols(Norm.SV); n++) fprintf(stderr,"%d:%g ",n,Norm.SV[2][n]); fprintf(stderr,"...\n"); } break; case 'h': if (NextArg() != STRINGARG) ReportUsage("VExtract: Vector definition string expected after -h\n"); HThresh = strtoDVector(&xfStack, GetStrArg()); /* Add enough room to the stats block in order to store histogram in each class */ statsblocksize = 4 + DVectorSize(HThresh); if (trace & T_OPTS) { fprintf(stderr,"Histogram will be computed with %d thresholds: [%g", DVectorSize(HThresh),HThresh[1]); for(i=2;i<=DVectorSize(HThresh);i++) fprintf(stderr,",%g",HThresh[i]); fprintf(stderr,"]\n"); } break; case 'm': /* Get information about the number, source, and locking type of toks */ /* First, find out how many tokens are desired */ if (NextArg() != INTARG) ReportUsage("VExtract: -m should be followed by an integer\n"); maxTotal=GetChkedInt(0,10000,s); /* Second, find out if this number is desired once per file or per corpus */ if (NextArg() != STRINGARG) ReportUsage("VExtract: -m should be followed by NUM SOURCE LOCKTYPE\n"); pat=GetStrArg(); /* If source is `file' or `FILE', make this the number per file, not per corpus */ if(*pat=='f' || *pat=='F') { nMaxInput=maxTotal; maxTotal=-1; } /* Otherwise: if SOURCE argument is unrecognized, call an error message */ else if(*pat != 'c' && *pat != 'C') { snprintf(buf,MAXSTRLEN, "SOURCE argument of -m must be `file' or `corpus,' not %s\n",pat); ReportUsage(buf); } /* Third, find out whether or not to lock the corpus */ if (NextArg() != STRINGARG) ReportUsage("VExtract: -m should be followed by NUM SOURCE LOCKTYPE\n"); pat=GetStrArg(); /* Determine locking_type */ if(*pat=='l' || *pat=='l') { if(nMaxInput >= 0) locking_type='f'; else locking_type='c'; } else if(*pat=='u' || *pat=='U') locking_type='n'; /* If locking_type not recognized, print an error */ else { snprintf(buf,MAXSTRLEN, "LOCKTYPE argument of -m must be `locked' or `unlocked,' not %s\n",pat); ReportUsage(buf); } /* Trace information */ if (trace & T_OPTS) { if(nMaxInput >= 0) fprintf(stderr,"Will output at most %d tokens/class/file,",nMaxInput); else fprintf(stderr,"Will output at most %d tokens/class total,",maxTotal); if(locking_type=='n') fprintf(stderr," different classes may have different token counts\n"); else fprintf(stderr," different classes will have the same token count\n"); } break; case 'o': if (NextArg() != STRINGARG) ReportUsage("VExtract: Output filename expected after -o\n"); if((outputfid=fopen((filename=GetStrArg()),"w"))==NULL) { perror("VExtract"); HError(1,"VExtract: Unable to write to output file %s\n",filename); } if(trace & T_OPTS) fprintf(stderr,"VExtract: Will write output to %s\n",filename); break; case 't': /* Specify relative frame numbers that will be concatenated */ if (NextArg() != STRINGARG) ReportUsage("VExtract: -t should be followed by a vector of offsets\n"); relfr = strtoIntVec(&xfStack, GetStrArg()); if (trace & T_OPTS) { fprintf(stderr,"Output lines will contain %d frames: [%d",IntVecSize(relfr),relfr[1]); for(ifr=2;ifr<=IntVecSize(relfr);ifr++) fprintf(stderr,",%d",relfr[ifr]); fprintf(stderr,"]\n"); } break; case 'A': /* Repeat the arguments to stderr */ for(m=0; m<=argc; m++) fprintf(stderr,"%s ",argv[m]); break; case 'I': if (NextArg() != STRINGARG) ReportUsage("VExtract: Input MLF filename expected after -I\n"); if((inputmlf=fopen((arg=GetStrArg()),"r"))==NULL) { perror("VExtract"); HError(1,"VExtract: Unable to read from input MLF %s\n",arg); } /* Read the input file transcriptions */ nTranscripts = read_MLF(inputmlf, &xfStack, nTranscripts,TranscriptFilenames,nLabels,TranscriptLabels, TranscriptStartTimes,TranscriptEndTimes); if(trace & T_OPTS) fprintf(stderr,"Read %d file descriptions from %s\n",nTranscripts,arg); fclose(inputmlf); break; case 'R': fprintf(stderr,"%s\n",vextract_version); PrintRCSIdentifier(stderr); Exit(0); case 'T': trace = GetChkedInt(0,255,s); break; case 'V': fprintf(stderr,vextract_version); Exit(0); default: snprintf(buf,MAXSTRLEN,"VExtract: Unknown switch %s",s); ReportUsage(buf); } } /* Verify the defined classes */ if (trace & T_TOP) for(i=0; i 0) { if (NextArg()!=STRINGARG) HError(1,"VExtract: expected only filenames after options are done"); FileList[NumFiles] = CopyString(&xfStack,GetStrArg()); NumFiles++; } /**************************************************************************/ /* ----------- First time through the list of files: */ /* -------------Figure out how many files are in each class, */ /* -------------Check to make sure that we have an open and working MLF */ if (inputmlf==NULL) ReportUsage("VExtract: Input MLF must be specified"); /* Allocate space to count the number of tokens per class */ nTotal = (int *)calloc(nClasses,sizeof(int)); for(IFile=0; IFile < NumFiles; IFile++) { if((fid=fopen(FileList[IFile],"r"))==NULL) { fprintf(stderr,"VExtract: unable to read from %s\n",FileList[IFile]); perror("VExtract"); } else { /* Read header to find sampling period and number of rows, then close file */ ReadHParmHeader(fid, &HTKDATA); fclose(fid); /* Expand the filename to get rid of . and .. */ realpath(FileList[IFile],fullpath); /* Skip past the directory structure to the filename */ filename=basename(fullpath); /* Snip off the extension, if any */ if((s=strchr(filename,'.'))!=NULL) *s='\0'; /* Look in TranscriptFilenames for the filename */ for(iTranscript=0; iTranscript=nTranscripts) fprintf(stderr,"VExtract: Unable to find root file %s in any MLF; skipping\n", filename); else { TranscriptIndex[IFile]=iTranscript; if(trace & T_VERBOSE) fprintf(stderr, "%s (%d'th input) and %s (%d'th transcript, %d lines) matched %s\n", FileList[IFile],IFile,TranscriptFilenames[iTranscript], iTranscript,nLabels[iTranscript],filename); /* Allocate space for the token times: up to the total number of */ /* frames in the file, according to HTKDATA */ TokenTimes[IFile]=(int **)calloc(nClasses, sizeof(int *)); for(i=0; i nTotal[i]) maxTotal=nTotal[i]; if(maxTotal==0) { fprintf(stderr, "VExtract: number of output tokens for all classes has been set to zero\n"); Exit(0); } /* Set the sampling period for this class: either nTotal[i]/maxTotal, or 1.0 */ for (i=0; i0 && maxTotal0 && nMaxInput= 2) { /* The H-J fast variance formula: Var = (SSQ-Sum*Sum/N) / (N-1) */ Stats.SV[p+2][d] = (Stats.SV[p+2][d]-Stats.SV[p+1][d]*Stats.SV[p+1][d]/Stats.SV[p+3][d])/(Stats.SV[p+3][d]-1); Stats.SV[p+1][d] /= Stats.SV[p+3][d]; /* Mean = Sum/N */ Stats.SV[p+2][d] = (Stats.SV[p+2][d]>0) ? sqrt(Stats.SV[p+2][d]) : 0; /* SD=sqrt(Var) if Var>0, 0 otherwise */ } else Stats.SV[p+2][d] = 0; /* If N<2, SD=0 */ } /* If normalization not previously defined, copy Stats to Norm */ if(Norm.SV == NULL) { if(trace & T_VERBOSE) fprintf(stderr,"No normalization file read, so we'll use computed stats instead\n"); CopySVMDef(&Norm, &Stats, &xfStack); } else if(trace & T_VERBOSE) fprintf(stderr,"Normalization will use stats read with -g option, not computed stats\n"); } /**********************************************************************************/ /* If normalization is available, normalize the output tokens --------------------*/ if(Norm.SV != NULL) { if(trace & T_VERBOSE) fprintf(stderr,"Normalizing outputs: norm size (%d,%d), output size (%d,%d)\n", NumDRows(Norm.SV),NumDCols(Norm.SV),NumDRows(Output.SV),NumDCols(Output.SV)); if(NumDCols(Norm.SV) != NumDCols(Output.SV)) fprintf(stderr,"VExtract: Can't normalize; normfile has %d columns, output has %d\n",NumDCols(Norm.SV),NumDCols(Output.SV)); else if(NumDRows(Norm.SV)<2) fprintf(stderr,"VExtract: Can't normalize; normfile needs at least 2 rows\n"); else for(n=1; n<=NumDCols(Output.SV); n++) { for(m=1; m<=NumDRows(Output.SV); m++) Output.SV[m][n] -= Norm.SV[1][n]; if(Norm.SV[2][n] > 0) for(m=1; m<=NumDRows(Output.SV); m++) Output.SV[m][n] /= Norm.SV[2][n]; } } /**********************************************************************************/ /* Compute the histogram, if one has been requested ------------------------------*/ if(statsoutfile != NULL && statsblocksize > 3) { if(trace & T_VERBOSE) fprintf(stderr,"Computing a histogram of size %d\n",statsblocksize-3); for(i=0; i Stats.alpha[k]; k++); (Stats.SV[k][n])++; (Stats.SV[((int)Output.w[m])*statsblocksize+k][n])++; } } /**********************************************************************************/ /* Finally, print the statistics file, and the output, if so requested -----------*/ if(statsoutfile != NULL) { if(trace & T_VERBOSE) fprintf(stderr,"Saving computed statistics to %s\n",statsoutfile); if((fid=fopen(statsoutfile,"w"))==NULL) { fprintf(stderr,"VApplySvms: Unable to write to %s\n",statsoutfile); perror("VExtract"); } else { SaveSVMVectors(fid,&Stats); fclose(fid); } } if(outputfid != NULL) { if(trace & T_VERBOSE) fprintf(stderr,"Saving output tokens\n"); SaveSVMVectors(outputfid,&Output); fclose(outputfid); } Exit(0); }