/* -------------------------------------------------------------------------------- */ /* File: PVTKLib.c: Library functions for PVTK A very small toolkit for interfacing HTK and svmlight Copyright 2005, Trustees of the University of Illinois Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. Revision history: May 2005, Sarah Borys: Fixed bugs in ReadHParm and WriteHParm August 2004, Mark Hasegawa-Johnson: First revision created Likely future revisions: PVTK should be cleaned up, made independent of HTK, merged with SpeechLib, and rewritten from the bottom up to be compatible with LAPACK and STL. 
/* -------------------------------------------------------------------------------- */ char *pvtklib_version = "$Id: PVTKLib.c,v 1.5 2004/08/12 19:16:55 mhasegaw Exp mhasegaw $"; #include "PVTK.h" /* Global variables */ /* Call uname to check the architecture; store information in islittleendian_storage */ /* Return 1 for little-endian, 0 for big-endian or unknown */ int islittleendian_storage; int islittleendian_storage_flag; struct utsname architecture_storage; int islittleendian() { if(islittleendian_storage_flag != 363) { islittleendian_storage_flag = 363; /* A silly non-zero number to tell us the flag has been set */ /* Figure out whether big-endian or little-endian data */ uname(&architecture_storage); if(strncmp(architecture_storage.machine,"i686",4)==0) islittleendian_storage=1; else if(strncmp(architecture_storage.machine,"i386",4)==0) islittleendian_storage=1; else if(strncmp(architecture_storage.machine,"i486",4)==0) islittleendian_storage=1; else if(strncmp(architecture_storage.machine,"i586",4)==0) islittleendian_storage=1; else if(strncmp(architecture_storage.machine,"alpha",4)==0) islittleendian_storage=1; else if(strncmp(architecture_storage.machine,"sun4u",4)==0) islittleendian_storage=0; else if(strncmp(architecture_storage.machine,"sparc",4)==0) islittleendian_storage=0; else islittleendian_storage=0; } return(islittleendian_storage); } /* Debug information */ void PrintRCSIdentifier(FILE *fid) { fprintf(fid,"%s\n",pvtklib_version); } /* --------------- New Handlers for DMatrix, DVector --------------------- */ /* RawReadDouble: read double data from src in ascii */ Boolean RawReadDouble(FILE *fid, double *x, int n, Boolean bin, Boolean swap) { int k,count=0,j; double *p; for (j=1; j<=n; j++){ if (fscanf(fid,"%le%n",x,&k) != 1) return FALSE; x++; count += k; } return TRUE; } /* Byteswap data of arbitrary size */ size_t byteswap(char *ptr, size_t size, size_t nmemb) { size_t halfsize, m, n; char *p1, *p2, tmp; /* Compute half the size */ 
halfsize=size/2; for(n=0; nmax)?max:n)); } /* Check Vector start index to make sure it's in range */ int CheckVectorStart(int s, int n, int size) { if (s<0) s=0; if (s+n > size) s=size-n; return(s); } /* Find the maximum of each column in a DMatrix */ void DMatrixMax(DMatrix A, DVector v) { int m,n; if(A==NULL) { v=NULL; return; } CopyDVector(A[1],v); for(m=2; m<=NumDRows(A); m++) for(n=1; n<=NumDCols(A); n++) if(A[m][n]>v[n]) v[n]=A[m][n]; } /* Find the minimum of each column in a DMatrix */ void DMatrixMin(DMatrix A, DVector v) { int m,n; if(A==NULL) { v=NULL; return; } CopyDVector(A[1],v); for(m=2; m<=NumDRows(A); m++) for(n=1; n<=NumDCols(A); n++) if(A[m][n]DVectorSize(v1)) n=(s1<0)?DVectorSize(v1):DVectorSize(v1)-s1; s1=CheckVectorStart(s1,n,DVectorSize(v1)); s2=CheckVectorStart(s2,n,DVectorSize(v2)); for (; n>0; n--) v2[s2+n] = v1[s1+n]; } /* copy column subvector m1[r1+[1:n]][c1] into subvector m2[r2+[1:n]][c2] */ /* Default n: DVectorSize(v1)-s1 */ void CopySubColumnDD(DMatrix m1, int r1, int c1, DMatrix m2, int r2, int c2, int n) { if (n==0||n>NumDRows(m1)-r1) n=(r1<0)?NumDRows(m1):NumDRows(m1)-r1; r1=CheckVectorStart(r1,n,NumDRows(m1)); c1=(c1>NumDCols(m1))?NumDCols(m1):(c1<1?1:c1); r2=CheckVectorStart(r2,n,NumDRows(m2)); c2=(c2>NumDCols(m2))?NumDCols(m2):(c2<1?1:c2); for (; n>0; n--) m2[r2+n][c2] = m1[r1+n][c1]; } /* Zero-out subvector v1[s1+[1:n]] */ /* Default n: DVectorSize(v1)-s1 */ void ZeroSubVectorDD(DVector v1, int s1, int n) { if (n==0||n>DVectorSize(v1)) n=(s1<0)?DVectorSize(v1):DVectorSize(v1)-s1; s1=CheckVectorStart(s1,n,DVectorSize(v1)); for (; n>0; n--) v1[s1+n]=0; } /* copy subvector of v1 into subvector of v2 */ void CopySubVectorDF(DVector v1, Vector v2, int s1, int s2, int n) { if (n==0||n>DVectorSize(v1)) n=(s1<0)?DVectorSize(v1):DVectorSize(v1)-s1; s1=CheckVectorStart(s1,n,DVectorSize(v1)); s2=CheckVectorStart(s2,n,VectorSize(v2)); for (; n>0; n--) v2[s2+n] = (float) v1[s1+n]; } /* copy subvector of v1 into subvector of v2 */ void 
CopySubVectorFD(Vector v1, DVector v2, int s1, int s2, int n) { if (n==0||n>VectorSize(v1)) n=(s1<0)?VectorSize(v1):VectorSize(v1)-s1; s1=CheckVectorStart(s1,n,VectorSize(v1)); s2=CheckVectorStart(s2,n,DVectorSize(v2)); for (; n>0; n--) v2[s2+n] = v1[s1+n]; } /* B = transpose(A) */ void TransposeDMatrix(DMatrix A, DMatrix B) { int i, j, m, n; if((m = NumDRows(A))!=NumDCols(B)) HError(5270,"TransposeDMatrix: Rows(A)=%d should equal Cols(B)=%d\n", m, NumDCols(B)); if((n = NumDCols(A))!=NumDRows(B)) HError(5270,"TransposeDMatrix: Cols(A)=%d should equal Rows(B)=%d\n", n, NumDRows(B)); for(i=1; i<=m; i++) for(j=1; j<=n; j++) B[j][i]=A[i][j]; } /* Multiply two DMatrices , C = A*B */ void MultiplyDMatrices(DMatrix C, DMatrix A, DMatrix B) { int i, j, k, Crows, Ccols, Acols; if((Crows = NumDRows(A)) != NumDRows(C)) HError(5270,"MultiplyDMatrices: Rows(A)=%d should equal Rows(C)=%d\n", Crows,NumDRows(C)); if((Ccols = NumDCols(B)) != NumDCols(C)) HError(5270,"MultiplyDMatrices: Cols(B)=%d should equal Cols(C)=%d\n", Ccols,NumDCols(C)); if ((Acols=NumDCols(A)) != NumDRows(B)) HError(5270,"MultiplyDMatrices: Cols(A)=%d should equal Rows(B)=%d\n", Acols, NumDRows(B)); for (i=1; i <= Crows; i++ ) for (k=1; k <= Ccols; k++) { C[i][k] = A[i][1] * B[1][k]; for (j=2; j <= Acols; j++ ) C[i][k] += A[i][j]*B[j][k]; } } /* Add two DVectors, C = A+B */ void AddDVectors(DVector C, DVector A, DVector B) { int i, size; size=DVectorSize(A); if (size != DVectorSize(B) || size != DVectorSize(C)) HError(5270,"AddVectors: Sizes(A,B,C)=(%d,%d,%d) should be equal\n", size, DVectorSize(B), DVectorSize(C)); for (i=1; i<=size; i++) C[i] = A[i] + B[i]; } /* Read an integer vector from a string */ /* Two formats are understood: */ /* start:step:end (or the degenerate form, start:end) */ /* i1,i2,i3,... 
*/
/* Initial and final characters are ignored unless digits, '-', '.', ',', ':' */
/* Return value is the newly created IntVec (indexed from 1), or NULL if a */
/* start:step:end specification cannot be parsed, has a zero step, or describes an empty range */
IntVec strtoIntVec(MemHeap *heap, char *s) {
  char *p, *rem;
  int i, nfr, ifr, start, step, stop;
  IntVec v;

  /* Check for colons in the input, indicating start:step:end syntax */
  if ((p = strchr(s, ':')) != NULL && p > s) {
    /* Work backward through all digits and '-' signs to find beginning of the vector */
    while (p > s && (isdigit((unsigned char) p[-1]) || p[-1] == '-'))
      p--;

    /* Try to read start:step:end; count the number of fields we get to read */
    start = strtol(p, &rem, 10);
    if (rem == p) {
      fprintf(stderr,"strtoIntVec unable to read start from %s\n",s);
      return(NULL);
    }
    /* rem now points at or before the first ':' so rem+1 is still inside the string */
    step = strtol(rem+1, &p, 10);
    if (p == rem+1) {
      fprintf(stderr,"strtoIntVec unable to read step from %s\n",s);
      return(NULL);
    }
    /* Only look for a third field if the string has not ended; p+1 would otherwise */
    /* point past the terminating '\0' */
    if (*p != '\0')
      stop = strtol(p+1, &rem, 10);
    if (*p == '\0' || rem == p+1) {
      /* Degenerate form start:end -- the second field was really the end */
      stop = step;
      step = 1;
    }
    fprintf(stderr,"Read %d, %d, %d from %s\n",start,step,stop,s);

    if (step == 0) {
      fprintf(stderr,"strtoIntVec: step must be nonzero in %s\n",s);
      return(NULL);
    }
    nfr = (stop-start)/step + 1;
    if (nfr < 1) {
      fprintf(stderr,"strtoIntVec: empty range in %s\n",s);
      return(NULL);
    }

    v = CreateIntVec(heap, nfr);
    for (i = 1; i <= nfr; i++)
      v[i] = start + (i-1)*step;

    fprintf(stderr,"Created a vector of size %d: [%d",IntVecSize(v), v[1]);
    for (i = 2; i <= nfr; i++)
      fprintf(stderr,",%d",v[i]);
    fprintf(stderr,"]\n");
  }
  else {
    /* First count the commas */
    for (nfr = 1, p = s; *p != '\0'; p++)
      if (*p == ',')
        nfr++;

    /* Allocate and zero the vector */
    v = CreateIntVec(heap, nfr);
    ZeroIntVec(v);

    /* For each vector element: skip any initial non-digits, then use strtol to get the integer. */
    /* The loop ends when strtol makes no progress, the string ends, or the vector is full */
    p = NULL;
    rem = s;
    ifr = 1;
    while (p != rem && *rem != '\0' && ifr <= nfr) {
      for (p = rem; *p != '\0' && *p != '-' && !isdigit((unsigned char) *p); p++)
        ;
      v[ifr++] = strtol(p, &rem, 10);
    }
  }
  return(v);
}

/* Remove the // wrapper from an input string */
/* If the string has length at least two characters, and if the first and last characters are the same, */
/* then the last character is overwritten with '\0' (the string content is changed!!) and a pointer */
/* to the second character is returned.  Otherwise pat itself is returned unchanged. */
char *remove_wrapper(char *pat) {
  size_t len = strlen(pat);

  if (len > 1 && pat[0] == pat[len-1]) {
    pat[len-1] = '\0';
    return(pat+1);
  }
  return(pat);
}

/* Read a DVector from a string */
/* Two formats are understood: */
/*    start:step:end  (or the degenerate form, start:end) */
/*    x1,x2,x3,... */
/* Initial and final characters are ignored unless digits, '-', '.', ',', ':' */
/* Return value is the newly created DVector (indexed from 1), or NULL if a */
/* start:step:end specification cannot be parsed, has a zero step, or describes an empty range */
DVector strtoDVector(MemHeap *heap, char *s) {
  char *p, *rem;
  int i, nfr, ifr;
  double start, step, stop;
  DVector v;

  /* Check for colons in the input, indicating start:step:end syntax */
  if ((p = strchr(s, ':')) != NULL && p > s) {
    /* Work backward through all digits, '-', 'e', 'E', and '.' signs to find beginning of the vector */
    while (p > s && (isdigit((unsigned char) p[-1]) || p[-1]=='-' || p[-1]=='.' || p[-1]=='e' || p[-1]=='E'))
      p--;

    /* Try to read start:step:end; count the number of fields we get to read */
    start = strtod(p, &rem);
    if (rem == p) {
      fprintf(stderr,"strtoDVector unable to read start from %s\n",s);
      return(NULL);
    }
    /* rem now points at or before the first ':' so rem+1 is still inside the string */
    step = strtod(rem+1, &p);
    if (p == rem+1) {
      fprintf(stderr,"strtoDVector unable to read step from %s\n",s);
      return(NULL);
    }
    /* Only look for a third field if the string has not ended; p+1 would otherwise */
    /* point past the terminating '\0' */
    if (*p != '\0')
      stop = strtod(p+1, &rem);
    if (*p == '\0' || rem == p+1) {
      /* Degenerate form start:end -- the second field was really the end */
      stop = step;
      step = 1;
    }

    if (step == 0) {
      fprintf(stderr,"strtoDVector: step must be nonzero in %s\n",s);
      return(NULL);
    }
    nfr = floor((stop-start)/step) + 1;
    if (nfr < 1) {
      fprintf(stderr,"strtoDVector: empty range in %s\n",s);
      return(NULL);
    }

    v = CreateDVector(heap, nfr);
    for (i = 1; i <= nfr; i++)
      v[i] = start + (i-1)*step;
  }
  else {
    /* First count the commas */
    for (nfr = 1, p = s; *p != '\0'; p++)
      if (*p == ',')
        nfr++;

    /* Allocate and zero the vector */
    v = CreateDVector(heap, nfr);
    ZeroDVector(v);

    /* For each vector element: skip any initial non-digits, then use strtod to get the value. */
    /* The loop ends when strtod makes no progress, the string ends, or the vector is full */
    p = NULL;
    rem = s;
    ifr = 1;
    while (p != rem && *rem != '\0' && ifr <= nfr) {
      for (p = rem;
           *p != '\0' && *p != '-' && *p != '.' && *p != 'e' && *p != 'E' && !isdigit((unsigned char) *p);
           p++)
        ;
      v[ifr++] = strtod(p, &rem);
    }
  }
  return(v);
}

/* -------------------- Input and Output Files ------------------------ */
/* -------------------------------------------- */
/* Read the next space-separated word from an input stream, up to SIZE characters */
/* A "space-separated word" includes an arbitrary sequence of non-whitespace chars, */
/* followed by an arbitrary sequence of space chars. 
If the first char is a space, */ /* then all returned characters will be space!!! */ /* Return value is equal to w if a nonzero number of chars were read before feof, otherwise NULL */ /* NOTE(review): the text of this file is damaged in two places below: inside the first for-loop condition of fgetw, and from the second while-loop of fgetw up to the middle of the next function (its header and opening statements are missing; ReadLibSVMHeader and ReadSVMFile call it as guess_svmtoks_size(fid, heap, s)).  In both places a span starting at a '<' character appears to have been lost.  Restore this region from the original PVTKLib.c before compiling. */ char *fgetw(char *w, int size, FILE *fid) { int n; /* First, read an arbitrary sequence of non-space characters */ for(w[(n=0)]=fgetc(fid); !isspace(w[n]) && !feof(fid) && n=size) { if(n==0) return(NULL); /* If we only read one character, and it was EOF */ if(w[n]==EOF) w[n]='\0'; /* If the last character read was EOF */ else w[++n]='\0'; /* If we stopped because we exceeded size */ return(w); } /* Second, read an arbitrary sequence of space characters */ while(isspace(w[n]) && !feof(fid) && nncols) ncols=icol; /* If the column number is higher than any we've heard of, use it */ new_row=0; /* Reset the new_row flag */ } /* If a newline was encountered in the post-word spaces, set the new_row flag */ if(strchr(w,'\n')!=NULL) new_row=1; } /* Rewind the input file so that the caller can re-read it, loading labels and data into arrays */ rewind(fid); s->kerneltype = K_INPUT; /* Type marker for "input vectors" */ /* If nrows > 0, allocate space, and return the structure */ if(nrows > 0) { s->SV = CreateDMatrix(heap, nrows, ncols); s->alpha = CreateDVector(heap, nrows); return(s); } /* If not, return null */ s->SV = NULL; s->alpha = NULL; return(NULL); } /*******************************************************************/ /* Read an SVMlight header, and put values into the SVMDef struct */ /* If we can find a nonzero SV matrix in this file, allocate it, */ /* and return the SVMDef struct pointer. 
If not, return NULL */ /*******************************************************************/ SVMDef *ReadSVMLightHeader(FILE *fid, MemHeap *heap, SVMDef *s) { int nrows=0,ncols=0,icol; double a; int n,new_row=1; char w[MAX_LINE], *rem; w[0]='\0'; /* Continue until we've read "each following line is a SV" */ while(strstr(w,"each following line")==NULL) { /* Lines in the header should be pretty short, so we can use fgets */ fgets(w,MAX_LINE,fid); /* See if there is useful parameter information in this line */ if(strstr(w,"kernel type") != NULL) { s->kerneltype=strtol(w,&rem,10); } else if(strstr(w,"kernel parameter -d") != NULL) { s->d=strtod(w,&rem); } else if(strstr(w,"kernel parameter -g") != NULL) { s->g=strtod(w,&rem); } else if(strstr(w,"kernel parameter -s") != NULL) { s->s=strtod(w,&rem); } else if(strstr(w,"kernel parameter -r") != NULL) { s->r=strtod(w,&rem); } else if(strstr(w,"highest feature index") != NULL) { ncols=strtol(w,&rem,10); } else if(strstr(w,"number of support vectors plus 1") != NULL) { nrows=strtol(w,&rem,10) - 1; } else if(strstr(w,"threshold b") != NULL) { s->b=strtod(w,&rem); } } /* If nrows > 0, allocate space, and return the structure */ if(nrows > 0) { s->SV = CreateDMatrix(heap, nrows, ncols); s->alpha = CreateDVector(heap, nrows); return(s); } /* If not, return null */ s->SV = NULL; s->alpha = NULL; return(NULL); } /*******************************************************************/ /* Read a libSVM header, and put values into the SVMDef struct */ /* If we can find a nonzero SV matrix in this file, allocate it, */ /* and return the libSVM struct pointer. If not, return NULL */ /*******************************************************************/ SVMDef *ReadLibSVMHeader(FILE *fid, MemHeap *heap, SVMDef *s) { int nrows=0,ncols=0,icol; double a; int n,new_row=1; char w[MAX_LINE], *rem; /* In order to allocate the matrix size, in libSVM, we need to read the tokens. 
Do it */ s=guess_svmtoks_size(fid, heap, s); if(s==NULL) return(NULL); /* Now read the header in order to see what else we can find out */ w[0]='\0'; /* Continue until we've read "SV" */ while(strstr(w,"SV")==NULL) { /* Lines in the header should be pretty short, so we can use fgets */ fgets(w,MAX_LINE,fid); /* Kernel type? */ if(strstr(w,"kernel_type") != NULL) { if(strstr(w,"linear")!=NULL) s->kerneltype=0; else if(strstr(w,"polynomial")!=NULL) s->kerneltype=1; else if(strstr(w,"rbf")!=NULL) s->kerneltype=2; else if(strstr(w,"sigmoid")!=NULL) s->kerneltype=3; else fprintf(stderr,"ReadLibSVMHeader doesn't grok kernel of type %s\n",w); } /* gamma? */ if(strstr(w,"gamma")!=NULL) { rem=strchr(w,' '); if(rem!=NULL) s->g=strtod(rem,NULL); } /* degree? */ if(strstr(w,"degree")!=NULL) { rem=strchr(w,' '); if(rem!=NULL) s->d=strtod(rem,NULL); } /* coef0? */ if(strstr(w,"coef0")!=NULL) { rem=strchr(w,' '); if(rem!=NULL) s->r=strtod(rem,NULL); } } /* Return s */ return(s); } void ZeroSVMDef(SVMDef *s) { s->kerneltype=0; s->SV=NULL; s->alpha=NULL; s->w=NULL; s->b=0; s->d=0; s->g=0; s->r=0; s->s=0; } /*******************************************************************/ /* Read a matrix from an svmlight-format file */ /* If we can find a nonzero SV matrix in this file, allocate it, */ /* and return the SVMDef struct pointer. 
If not, return NULL */ /*******************************************************************/ SVMDef *ReadSVMFile(char *filename, MemHeap *heap, SVMDef *s, int trace) { int nrows=0,ncols=0,irow=0,icol=0; double a; int n,m,new_row=1,row_counted=0; char w[MAX_LINE], *rem; FILE *fid; /* If s is null, return */ if(s==NULL || heap==NULL) { fprintf(stderr,"ReadSVMFile(%s) called with null SVMDef\n",filename); return(NULL); } /* Initialize s */ ZeroSVMDef(s); /* Try to open the input file */ if((fid=fopen(filename,"r"))==NULL) { fprintf(stderr,"ReadSVMFile: Unable to read from %s\n",filename); perror("ReadSVMFile"); return(NULL); } /* Read the first word */ if(fgetw(w,MAX_LINE,fid) == NULL) { fprintf(stderr,"ReadSVMFile: Unable to read from %s\n",filename); perror("ReadSVMFile"); fclose(fid); return(NULL); } /* If any characters in w are not printable, then the file is not an svmtoks file -- return NULL */ for(rem=w; *rem!='\0'; rem++) if(!isprint(*rem)) { fprintf(stderr,"ReadSVMFile: Unable to read ASCII text from %s\n",filename); fclose(fid); return(NULL); } /* Test the first word: if it contains "SVM-light," call ReadSVMLightHeader */ if(strstr(w,"SVM-light")!= NULL) { s = ReadSVMLightHeader(fid, heap, s); if(trace & T_VERBOSE) { if(s!=NULL) fprintf(stderr,"Reading SVM-light SVM of size (%d,%d) from %s\n", NumDRows(s->SV),NumDCols(s->SV),filename); else fprintf(stderr,"Failed to read an SVM definition from %s\n",filename); } } /* Otherwise -- check to see if this is a libSVM header, and if so, read it as such */ else if(strstr(w,"svm_type")!=NULL) { s = ReadLibSVMHeader(fid, heap, s); if(trace & T_VERBOSE) { if(s!=NULL) fprintf(stderr,"Reading libSVM SVM of size (%d,%d) from %s\n", NumDRows(s->SV),NumDCols(s->SV),filename); else fprintf(stderr,"Failed to read an SVM definition from %s\n",filename); } } /* Otherwise, read through the file looking for index:value pairs, and guess the svmtoks size */ else { s = guess_svmtoks_size(fid, heap, s); if(trace & T_VERBOSE) { 
if(s!=NULL) fprintf(stderr,"Reading matrix of size (%d,%d) from text file %s\n",NumDRows(s->SV),NumDCols(s->SV),filename); else fprintf(stderr,"Failed to read a matrix from %s\n",filename); } } /* If header read returned null, send the null back up to caller */ if(s==NULL) { fprintf(stderr,"ReadSVMFile: Unable to read SVM header from %s\n",filename); fclose(fid); return(NULL); } nrows=DVectorSize(s->alpha); /* Now that the header is in --- read the support vectors */ while(fgetw(w,MAX_LINE,fid) != NULL) { /* If new_row==1, read the row label or alpha value */ if(new_row==1 && irow < nrows) { s->alpha[irow+1]=strtod(w,&rem); new_row=0; /* This is no longer a new row */ row_counted=0; /* ... but the row has not yet been counted */ } /* Can we read index:value from this word? If so, do it!! */ if(sscanf(w,"%d:%lg",&icol,&a) == 2) { if(row_counted==0 && irowSV[irow][icol] = a; /* Enter the value in the matrix */ } /* If a newline was encountered in the post-word whitespace, set the new_row flag */ if(strchr(w,'\n')!=NULL) new_row=1; } /* If linear, compute the normal vector */ if(s->kerneltype == 0) { s->w=CreateDVector(heap, NumDCols(s->SV)); for(n=1; n<=NumDCols(s->SV); n++) { s->w[n] = s->alpha[1]*s->SV[1][n]; for(m=2; m<=NumDRows(s->SV); m++) s->w[n] += s->alpha[m]*s->SV[m][n]; } } /* Close the file */ fclose(fid); /* Return s */ return(s); } void PrintSVMDef(FILE *fid, SVMDef *s) { fprintf(fid," type=%d", s->kerneltype); /* kernel type */ fprintf(fid," b=%g",s->b); /* SVM offset parameter */ fprintf(fid," d=%g",s->g); /* polynomial kernel power parameter */ fprintf(fid," g=%g",s->g); /* RBF kernel inverse-variance parameter */ fprintf(fid," r=%g",s->r); /* poly and tanh kernels: offset parameter */ fprintf(fid," s=%g\n",s->s); /* poly and tanh kernels: scale parameter */ if(s->alpha != NULL) { if(DVectorSize(s->alpha)>0) fprintf(fid," alpha[1]=%g; ",s->alpha[1]); fprintf(fid," alpha size=%d\n",DVectorSize(s->alpha)); } else fprintf(fid," alpha is NULL\n"); 
if(s->SV != NULL) { if(NumDRows(s->SV)>0 && NumDCols(s->SV)>0) fprintf(fid," SV[1][1]=%g; ",s->SV[1][1]); fprintf(fid, " SV size=(%d,%d)\n",NumDRows(s->SV),NumDCols(s->SV)); } else fprintf(fid," SV is NULL\n"); } /*********************************************/ /* Write to an svmlight-format tokens file */ /* Return the number of support vectors written */ /*********************************************/ int SaveSVMVectors(FILE *fid, SVMDef *s) { int m,n,nrows,ncols; if(s->alpha==NULL || s->SV==NULL) { fprintf(stderr,"SaveSVMVectors called with null pointers; save aborted\n"); return(0); } nrows=NumDRows(s->SV); ncols=NumDCols(s->SV); if(nrows != DVectorSize(s->alpha)) { fprintf(stderr,"SaveSVMVectors: alpha size (%d) and SV count (%d,%d) should match\n",DVectorSize(s->alpha),nrows,ncols); return(0); } /* Print every line */ for(m=1; m<=nrows; m++) { fprintf(fid,"%g",s->alpha[m]); for(n=1; n<=ncols; n++) fprintf(fid," %d:%.8g",n,s->SV[m][n]); fprintf(fid,"\n"); } /* Return the number saved */ return(m-1); } int SaveSVMDef(char *filename, SVMDef *s) { int nsaved=0; FILE *fid; if((fid=fopen(filename,"w"))==NULL) { fprintf(stderr, "SaveSVMDef: Unable to write to file %s\n",filename); perror("SaveSVMDef"); return(0); } fprintf(fid,"SVM-light Version V5.00\n"); fprintf(fid,"%d # kernel type\n",s->kerneltype); fprintf(fid,"%.8g # kernel parameter -d \n",s->d); fprintf(fid,"%.8g # kernel parameter -g \n",s->g); fprintf(fid,"%.8g # kernel parameter -s \n",s->s); fprintf(fid,"%.8g # kernel parameter -r \n",s->r); fprintf(fid,"empty# kernel parameter -u \n"); fprintf(fid,"%d # highest feature index \n",NumDCols(s->SV)); fprintf(fid,"empty# number of training documents \n"); fprintf(fid,"%d # number of support vectors plus 1 \n",NumDRows(s->SV)+1); fprintf(fid,"%.8g # threshold b, each following line is a SV (starting with alpha*y)\n",s->b); if((nsaved=SaveSVMVectors(fid,s)) < NumDRows(s->SV)) fprintf(stderr,"SaveSVMDef Warning: Saved only %d of %d vectors in 
%s\n",nsaved,NumDRows(s->SV),filename); fclose(fid); return(nsaved); } void CopySVMDef(SVMDef *tgt, SVMDef *src, MemHeap *heap) { if(src->SV != NULL) { tgt->SV=CreateDMatrix(heap, NumDRows(src->SV), NumDCols(src->SV)); CopyDMatrix(src->SV, tgt->SV); } if(src->alpha != NULL) { tgt->alpha=CreateDVector(heap, DVectorSize(src->alpha)); CopyDVector(src->alpha, tgt->alpha); } if(src->w != NULL) { tgt->w = CreateDVector(heap, DVectorSize(src->w)); CopyDVector(src->w,tgt->w); } tgt->kerneltype=src->kerneltype; tgt->b=src->b; tgt->d=src->d; tgt->g=src->g; tgt->r=src->r; tgt->s=src->s; } /* Zero HParm */ void ZeroHParm(HParm *h) { h->nrows=0; h->sampPeriod=0; h->ncols=0; h->HTKCode=0; h->X=NULL; } /* Copy HParm Header only */ void CopyHParmHeader(HParm *src, HParm *tgt) { tgt->nrows=src->nrows; tgt->sampPeriod=src->sampPeriod; tgt->ncols=src->ncols; tgt->HTKCode=src->HTKCode; } /* Copy content of an HParm to an SVMDef */ void HParm2SVMDef(MemHeap *heap, HParm *h, SVMDef *s, int *sampPeriod, short *HTKCode) { ZeroSVMDef(s); if(h->X != NULL) { ZeroDVector(s->alpha = CreateDVector(heap, NumDRows(h->X))); CopyDMatrix(h->X, (s->SV = CreateDMatrix(heap, NumDRows(h->X), NumDCols(h->X)))); } if(sampPeriod!=NULL) *sampPeriod=h->sampPeriod; if(HTKCode!=NULL) *HTKCode=h->HTKCode; } /* Copy content of an SVMDef to an HParm */ void SVMDef2HParm(MemHeap *heap, SVMDef *s, HParm *h, int *sampPeriod, short *HTKCode) { ZeroHParm(h); if(s->SV != NULL) { CopyDMatrix(s->SV, (h->X = CreateDMatrix(heap, NumDRows(s->SV), NumDCols(s->SV)))); } if(sampPeriod!=NULL) h->sampPeriod=*sampPeriod; if(HTKCode!=NULL) h->HTKCode=*HTKCode; } /* Read HParm Header into h */ /* Returns h if successful, null otherwise */ HParm *ReadHParmHeader(FILE *fid, HParm *h) { int nSamp; short sampSize; /* Read the basic HTK Header */ if(fread_bigendian(&nSamp, sizeof(int), 1, fid) < 1) { fprintf(stderr,"ReadHParmHeader: Unable to read\n"); return(NULL); } if(fread_bigendian(&(h->sampPeriod), sizeof(int), 1, fid) < 1) { 
fprintf(stderr,"ReadHParmHeader: Unable to read\n"); return(NULL); } if(fread_bigendian(&sampSize, sizeof(short), 1, fid) < 1) { fprintf(stderr,"ReadHParmHeader: Unable to read\n"); return(NULL); } if(fread_bigendian(&(h->HTKCode), sizeof(short), 1, fid) < 1) { fprintf(stderr,"ReadHParmHeader: Unable to read\n"); return(NULL); } /* ncols = sampSize/2 if short data, otherwise sampSize/4 */ if(h->HTKCode & HASCOMPX) h->ncols = sampSize/2; else if(h->HTKCode==0) h->ncols = sampSize/2; else h->ncols = sampSize/4; /* nrows = nSamp if uncompressed, nSamp-4 if compressed */ if(h->HTKCode & HASCOMPX) h->nrows = nSamp-4; else h->nrows = nSamp; return(h); } /* Write HParm Header from h */ void WriteHParmHeader(FILE *fid, HParm *h) { int nSamp; short sampSize; /* Make sure that h->ncols and h->nrows are correct */ if(h->X != NULL) { if(h->ncols != NumDCols(h->X)) { fprintf(stderr,"WriteHParmHeader: changing column count from %d to %d\n", h->ncols,NumDCols(h->X)); h->ncols=NumDCols(h->X); } if(h->nrows != NumDRows(h->X)) { fprintf(stderr,"WriteHParmHeader: changing column count from %d to %d\n", h->nrows,NumDRows(h->X)); h->ncols=NumDRows(h->X); } } /* ncols = sampSize/2 if short data, otherwise sampSize/4 */ if(h->HTKCode & HASCOMPX) sampSize = 2 * h->ncols; else if(h->HTKCode==0) sampSize = 2 * h->ncols; else sampSize = 4 * h->ncols; /* nrows = nSamp if uncompressed, nSamp-4 if compressed */ if(h->HTKCode & HASCOMPX) nSamp = 4 + h->nrows; else nSamp = h->nrows; fwrite_bigendian(&nSamp, sizeof(int), 1, fid); fwrite_bigendian(&(h->sampPeriod), sizeof(int), 1, fid); fwrite_bigendian(&sampSize, sizeof(short), 1, fid); fwrite_bigendian(&(h->HTKCode), sizeof(short), 1, fid); } /* Read HTK Data Structure */ /* Returns h if successful, NULL otherwise */ HParm *ReadHParm(MemHeap *heap, char *filename, HParm *h, int trace) { FILE *fid; struct utsname arch; Boolean swap=FALSE; Vector A, B, FTMP; ShortVec STMP; int m,n; ZeroHParm(h); if((fid=fopen(filename, "r"))==NULL) { 
fprintf(stderr,"ReadHParm: Unable to read from %s\n",filename); perror("ReadHParm"); } else { /* Check endian-ness */ if(ReadHParmHeader(fid, h) == NULL) { fprintf(stderr,"ReadHParm: Unable to read from %s\n",filename); fclose(fid); return(NULL); } /* Check for the "compressed" flag */ if(h->HTKCode & HASCOMPX) { if(trace & T_IO) fprintf(stderr,"ReadHParm: %dx%d, period %d, compressed, from %s\n",h->nrows,h->ncols,h->sampPeriod,filename); /* Read the compression parameters */ A = CreateVector(heap,h->ncols); fread_bigendian(&(A[1]), sizeof(float), h->ncols, fid); B = CreateVector(heap,h->ncols); fread_bigendian(&(B[1]), sizeof(float), h->ncols, fid); /* Read and uncompress the data */ STMP = CreateShortVec(heap, h->ncols); h->X = CreateDMatrix(heap, h->nrows, h->ncols); for(m=1; m<=h->nrows; m++) { fread_bigendian(&(STMP[1]), sizeof(short), h->ncols, fid); for(n=1; n<=h->ncols; n++) h->X[m][n] = ((double)STMP[n]+B[n])/A[n]; } } else if (h->HTKCode == 0) { /* Waveform data: read short integers from the file */ if(trace & T_IO) fprintf(stderr,"ReadHParm: %dx%d, period %d, short integers, from %s\n", h->nrows,h->ncols,h->sampPeriod, filename); STMP = CreateShortVec(heap, h->ncols); h->X = CreateDMatrix(heap, h->nrows, h->ncols); for(m=1; m<=h->nrows; m++) { fread_bigendian(&(STMP[1]), sizeof(short), h->ncols, fid); for(n=1; n<=h->ncols; n++) h->X[m][n] = (double)STMP[n]; } } else { /* Any other feature type: read floating point data */ if(trace & T_IO) fprintf(stderr,"ReadHParm: %dx%d, period %d, floating point, from %s\n", h->nrows,h->ncols,h->sampPeriod, filename); FTMP = CreateVector(heap, h->ncols); h->X = CreateDMatrix(heap, h->nrows, h->ncols); for(m=1; m<=h->nrows; m++) { fread_bigendian(&(FTMP[1]), sizeof(float), h->ncols, fid); for(n=1; n<=h->ncols; n++) h->X[m][n] = (double)FTMP[n]; } } } fclose(fid); return(h); } /* Write HTK Data Structure */ /* Return the number of frames written */ int WriteHParm(MemHeap *heap, char *filename, HParm *h, int trace) { FILE 
*fid; struct utsname arch; Boolean swap=FALSE; int nSamp, NCOFS; Vector A, B, FTMP; DVector xmax, xmin; ShortVec STMP; int m=1,n; if((fid=fopen(filename,"w"))==NULL) { fprintf(stderr,"WriteHParm: Unable to write to %s\n",filename); perror("WriteHParm"); } else { /* Set the number of samples and h->ncols according to the size of the data matrix; ignore previous values */ h->nrows=NumDRows(h->X); h->ncols=NumDCols(h->X); /* Clear the CRCC bit. WriteHParm doesn't do CRCC yet */ if(h->HTKCode & HASCRCC) { if(trace & T_IO) fprintf(stderr,"WriteHParm: HTK CRC not supported; will clear the CRC bit\n"); h->HTKCode &= (~HASCRCC); } /* Check for the "compressed" flag */ if(h->HTKCode & HASCOMPX) { if(trace & T_IO) fprintf(stderr,"WriteHParm: Writing %d frames, dim %d, compressed, to %s\n",h->nrows,h->ncols,filename); WriteHParmHeader(fid, h); /* Create and write the compression parameters */ DMatrixMax(h->X, (xmax=CreateDVector(heap, NumDCols(h->X)))); DMatrixMin(h->X, (xmin=CreateDVector(heap, NumDCols(h->X)))); A = CreateVector(heap, h->ncols); B = CreateVector(heap, h->ncols); for(n=1; n<=h->ncols; n++) { if (xmax[n] != xmin[n]) A[n] = 2*32767./(xmax[n]-xmin[n]); else A[n] = 1; } for(n=1; n<=h->ncols; n++) { if (xmax[n] != xmin[n]) B[n] = (xmax[n]+xmin[n])*32767 / (xmax[n]-xmin[n]); else B[n] = 0; } fwrite_bigendian(&(A[1]), sizeof(float), h->ncols, fid); fwrite_bigendian(&(B[1]), sizeof(float), h->ncols, fid); if(trace & T_VERBOSE) fprintf(stderr,"Writing to %s a matrix of size %d,%d with sampPeriod %d, HTKCode %d\n", filename,NumDRows(h->X),NumDCols(h->X),h->sampPeriod,h->HTKCode); /* Write compressed data */ STMP = CreateShortVec(heap, h->ncols); for(m=1; m<=h->nrows; m++) { for(n=1; n<=h->ncols; n++) STMP[n] = (short) (h->X[m][n] * A[n] - B[n]); if(trace & T_VERBOSE) { for(n=1; n<=h->ncols; n++) fprintf(stderr,"(%d,%d):%g(%g,%d,%g,%g) ",m,n,h->X[m][n], (STMP[n]+B[n])/A[n],STMP[n],A[n],B[n]); fprintf(stderr,"\n"); } fwrite_bigendian(&(STMP[1]), sizeof(short), h->ncols, 
fid); } if(trace & T_VERBOSE) fprintf(stderr,"\n"); } else { if(trace & T_IO) fprintf(stderr,"WriteHParm: Writing %d frames, dim %d, uncompressed, to %s\n", h->nrows,h->ncols,filename); WriteHParmHeader(fid, h); if(trace & T_VERBOSE) fprintf(stderr,"Writing to %s a matrix of size %d,%d with sampPeriod %d, HTKCode %d\n", filename,NumDRows(h->X),NumDCols(h->X),h->sampPeriod,h->HTKCode); /* Write floating point data */ FTMP = CreateVector(heap, h->ncols); for(m=1; m<=h->nrows; m++) { for(n=1; n<=h->ncols; n++) FTMP[n] = (float)(h->X[m][n]); if(trace & T_VERBOSE) { for(n=1; n<=h->ncols; n++) fprintf(stderr,"(%d,%d):%g(%g) ",m,n,h->X[m][n],FTMP[n]); fprintf(stderr,"\n"); } fwrite_bigendian(&(FTMP[1]), sizeof(float), h->ncols, fid); } if(trace & T_VERBOSE) fprintf(stderr,"\n"); } /* Not COMP_X */ } fclose(fid); return(m-1); } /* -------------------------------------------- */ /* Read an MLF, and append its information to any that already exist in TranscriptLabels */ /* fid = opened MLF */ /* nTranscripts = the number of MLF transcript-file regions already read. Return value is new nTranscripts */ /* */ int read_MLF(FILE *fid, MemHeap *heap, int nTranscripts, char **TranscriptFilenames, int *nLabels, char ***TranscriptLabels, int **TranscriptStartTimes, int **TranscriptEndTimes) { static char line[MAX_LINE]; char *arg, *arg2, *s; int m,n; while(!feof(fid)) { /* Get the next line */ fgets(line, MAX_LINE, fid); /* Determine its treatment based on its first character!! 
*/ switch(line[0]) { case '"': if(nTranscripts>=MAX_FILES) { fprintf(stderr,"Maximum number of transcription files (%d) exceeded\n",MAX_FILES); return(nTranscripts); } arg=strrchr(line,'/')+1; /* Root file starts after last '/' on the line */ if(arg==NULL) arg=strchr(line,'"')+1; /* If no '/' found, go just after first '"' character */ if((s=strchr(arg,'.'))!=NULL) *s='\0'; /* Snip off the extension, if any */ else if((s=strchr(arg,'"'))!=NULL) *s='\0';/* If no extension found, snip off the final " */ /* Increment the number of transcripts */ m=nTranscripts++; /* Add arg to TranscriptFilenames */ TranscriptFilenames[m] = (char *)malloc((strlen(arg)+1)*sizeof(char)); strcpy(TranscriptFilenames[m], arg); nLabels[m]=0; TranscriptLabels[m]=(char **)calloc(MAX_LABELS, sizeof(char *)); TranscriptStartTimes[m]=(int *)calloc(MAX_LABELS, sizeof(int)); TranscriptEndTimes[m]=(int *)calloc(MAX_LABELS, sizeof(int)); /* Increment the number of transcripts */ break; case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': case '0': if (nTranscripts > 0) { m=nTranscripts-1; if(nLabels[m]>=MAX_LABELS) fprintf(stderr,"Maximum number of label lines (%d) exceeded in file %s\n",MAX_LABELS,TranscriptFilenames[m]); else { /* Treat this as a "time time label" line; if it doesn't match, skip it */ n = nLabels[m]; TranscriptStartTimes[m][n] = strtol(line, &arg, 10); TranscriptEndTimes[m][n] = strtol(arg, &arg2, 10); if(arg2 == arg || arg == line) fprintf(stderr,"read_MLF: Unable to find start_time end_time label on line: %s\n",line); else { /* Save the times and label */ TranscriptLabels[m][n] = CopyString(heap, arg2); nLabels[m]++; } } } } } return(nTranscripts); }