Logo Search packages:      
Sourcecode: speech-tools version File versions  Download package

viterbi_main.cc

/*************************************************************************/
/*                                                                       */
/*                Centre for Speech Technology Research                  */
/*                     University of Edinburgh, UK                       */
/*                      Copyright (c) 1995,1996                          */
/*                        All Rights Reserved.                           */
/*                                                                       */
/*  Permission is hereby granted, free of charge, to use and distribute  */
/*  this software and its documentation without restriction, including   */
/*  without limitation the rights to use, copy, modify, merge, publish,  */
/*  distribute, sublicense, and/or sell copies of this work, and to      */
/*  permit persons to whom this work is furnished to do so, subject to   */
/*  the following conditions:                                            */
/*   1. The code must retain the above copyright notice, this list of    */
/*      conditions and the following disclaimer.                         */
/*   2. Any modifications must be clearly marked as such.                */
/*   3. Original authors' names are not deleted.                         */
/*   4. The authors' names are not used to endorse or promote products   */
/*      derived from this software without specific prior written        */
/*      permission.                                                      */
/*                                                                       */
/*  THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK        */
/*  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING      */
/*  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT   */
/*  SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE     */
/*  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES    */
/*  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   */
/*  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,          */
/*  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF       */
/*  THIS SOFTWARE.                                                       */
/*                                                                       */
/*************************************************************************/
/*                 Authors:  Alan W Black and Simon King                 */
/*                 Date   :  January 1997                                */
/*-----------------------------------------------------------------------*/
/*  A simple use of the Viterbi decoder                                  */
/*                                                                       */
/*=======================================================================*/

#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include "EST.h"

EST_read_status load_TList_of_StrVector(EST_TList<EST_StrVector> &w,
                              const EST_String &filename,
                              const int vec_len);

static void print_results(EST_Relation &wstream);
static bool do_search(EST_Relation &wstream);
static EST_VTPath *vit_npath(EST_VTPath *p,EST_VTCandidate *c,EST_Features &f);
static EST_VTCandidate *vit_candlist(EST_Item *s,EST_Features &f);
static void top_n_candidates(EST_VTCandidate* &all_c);
static void load_vocab(const EST_String &vfile);

static void add_word(EST_Relation &w, const EST_String &word, int pos);

static void load_wstream(const EST_String &filename,
                   const EST_String &vfile,
                   EST_Relation &w,
                   EST_Track &obs);

static void load_given(const EST_String &filename,
                   const int ngram_order);
                   
static double find_gram_prob(EST_VTPath *p,int *state);

// special stuff for non-sliding window ngrams
static double find_extra_gram_prob(EST_VTPath *p,int *state, int time);
static void get_history(EST_StrVector &history, EST_VTPath *p);
static void fill_window(EST_StrVector &window,EST_StrVector &history,
                  EST_VTPath *p,const int time);
static int is_a_special(const EST_String &s, int &val);
static int max_history=0;

static EST_Ngrammar ngram;
static EST_String pstring = SENTENCE_START_MARKER;
static EST_String ppstring = SENTENCE_END_MARKER;
static float lm_scale = 1.0;
static float ob_scale = 1.0;
static float ob_scale2 = 1.0;

// pruning beam widths
static float beam=-1;
static float ob_beam=-1;
static int n_beam = -1;

static bool trace_on = FALSE;

// always logs
static double ob_log_prob_floor = SAFE_LOG_ZERO;  
static double ob_log_prob_floor2 = SAFE_LOG_ZERO;  
static double lm_log_prob_floor = SAFE_LOG_ZERO;  

int btest_debug = FALSE;
static EST_String out_file = "";
static EST_StrList vocab;
static EST_Track observations;  
static EST_Track observations2;  
static EST_TList<EST_StrVector> given; // to do : convert to array for speed
int using_given=FALSE;

// default is that obs are already logs
int take_logs = FALSE;
int num_obs = 1;




/** @name  <command>viterbi</command> <emphasis>Combine n-gram model and likelihoods to estimate posterior probabilities</emphasis>
  * @id viterbi-manual
  * @toc
 */

//@{

/**@name Synopsis
  */
//@{

//@synopsis

/**
viterbi is a simple time-synchronous Viterbi decoder. It finds the
most likely sequence of items drawn from a fixed vocabulary, given
frame-by-frame observation probabilites for each item in that
vocabulary, and a ngram grammar. Possible uses include:

<itemizedlist>
<listitem><para>Simple speech recogniser back end</para></listitem>
</itemizedlist>

viterbi can optionally use two sets of frame-by-frame observation
probabilites in a weighted-sum fashion. Also, the ngram language model
is not restricted to the conventional sliding window type in which the
previous n-1 items are the ngram context. Items in the ngram context
at each frame may be given. In this case, the user must provide a file
containing the ngram context: one (n-1) tuple per line. To include
items from the partial Viterbi path so far (i.e. found at recognition
time, not given) the special notation <-N> is used where N indicates
the distance back to the item required. For example <-1> would
indicate the item on the partial Viterbi path at the last frame. See
\Ref{Examples}.

<formalpara>
<para><title>Pruning</title></para>

<para>
Three types of pruning are available to reduce the size of the search
space and therefore speed up the search:

<itemizedlist>
<listitem><para>Observation pruning</para></listitem>
<listitem><para>Top-N pruning at each frame</para></listitem>
<listitem><para>Fixed width beam pruning</para></listitem>
</itemizedlist>

</para>
</formalpara>

*/

//@}

/**@name Options
  */
//@{

//@options


//@}

int main(int argc, char **argv)
{
    EST_StrList files;
    EST_Option al;
    EST_Relation wstream;
    double floor; // a temporary

    parse_command_line(argc, argv, 
       EST_String("[observations file] -o [output file]\n")+
       "Summary: find the most likely path through a sequence of\n"+
       "         observations, constrained by a language model.\n"+
       "-ngram <string>     Grammar file, required\n"+
       "-given <string>     ngram left contexts, per frame\n"+
       "-vocab <string>     File with names of vocabulary, this\n"+
       "                    must be same number as width of observations, required\n"+
       "-ob_type <string>   Observation type : likelihood .... and change doc\"probs\" or \"logs\" (default is \"logs\")\n"+
       "\nFloor values and scaling (scaling is applied after floor value)\n"+
       "-lm_floor <float>   LM floor probability\n"+
       "-lm_scale <float>   LM scale factor factor (applied to log prob)\n"+
       "-ob_floor <float>   Observations floor probability\n"+
       "-ob_scale <float>   Observation scale factor (applied to prob or log prob, depending on -ob_type)\n\n"+
       "-prev_tag <string>\n"+
       "                 tag before sentence start\n"+
       "-prev_prev_tag <string>\n"+
       "                 all words before 'prev_tag'\n"+
       "-last_tag <string>\n"+
       "                 after sentence end\n"+
       "-default_tags    use default tags of "+SENTENCE_START_MARKER+","
                  SENTENCE_END_MARKER+" and "+SENTENCE_END_MARKER+"\n"+
       "                 respectively\n\n"+

       "-observes2  <string> second observations (overlays first, ob_type must be same)\n"+
       "-ob_floor2 <float>  \n"+
       "-ob_scale2 <float>  \n\n"+
       "-ob_prune  <float> observation pruning beam width (log) probability\n"+
       "-n_prune   <int>   top-n pruning of observations\n"+
       "-prune     <float> pruning beam width (log) probability\n"+
       "-trace             show details of search as it proceeds\n",
                  files, al);

    out_file = al.present("-o") ? al.val("-o") : (EST_String)"-";

    if (files.length() != 1)
      {
      cerr << argv[0];
      cerr << ": you must give exactly one observations file on the command line";
      cerr << endl;
      cerr << "(use -observes2 for optional second observations)" << endl;
      exit(-1);
      }

    if (al.present("-ngram"))
    {
      ngram.load(al.val("-ngram"));
    }
    else
    {
      cerr << argv[0] << ": no ngram specified" << endl;
      exit(-1);
    }

    if(!al.present("-vocab"))
      {
      cerr << "You must provide a vocabulary file !" << endl;
      exit(-1);
      }

    load_wstream(files.first(),al.val("-vocab"),wstream,observations);
    if (al.present("-observes2"))
    {
      load_wstream(al.val("-observes2"),al.val("-vocab"),wstream,observations2);
      num_obs = 2;
    }

    if (al.present("-given"))
    {
      load_given(al.val("-given"),ngram.order());
      using_given=TRUE;
    }

    if (al.present("-lm_scale"))
      lm_scale = al.fval("-lm_scale");
    else
      lm_scale = 1.0;

    if (al.present("-ob_scale"))
      ob_scale = al.fval("-ob_scale");
    else
      ob_scale = 1.0;

    if (al.present("-ob_scale2"))
      ob_scale2 = al.fval("-ob_scale2");
    else
      ob_scale2 = 1.0;

    if (al.present("-prev_tag"))
      pstring = al.val("-prev_tag");
    if (al.present("-prev_prev_tag"))
      ppstring = al.val("-prev_prev_tag");

    // pruning
    if (al.present("-prune"))
      beam = al.fval("-prune");
    else
      beam = -1; // no pruning

    if (al.present("-ob_prune"))
      ob_beam = al.fval("-ob_prune");
    else
      ob_beam = -1; // no pruning

    if (al.present("-n_prune"))
    {
      n_beam = al.ival("-n_prune");
      if(n_beam <= 0)
      {
          cerr << "WARNING : " << n_beam;
          cerr << " is not a reasonable value for -n_prune !" << endl;
      }
    }
    else
      n_beam = -1; // no pruning
    


    if (al.present("-trace"))
      trace_on = TRUE;

    // language model floor
    if (al.present("-lm_floor"))
    {
      floor = al.fval("-lm_floor");
      if(floor < 0)
      {
          cerr << "Error : LM floor probability is negative !" << endl;
          exit(-1);
      }
      else if(floor > 1)
      {
          cerr << "Error : LM floor probability > 1 " << endl;
          exit(-1);
      }
      lm_log_prob_floor = safe_log(floor);
    }

    // observations floor
    if (al.present("-ob_floor"))
    {
      floor = al.fval("-ob_floor");
      if(floor < 0)
      {
          cerr << "Error : Observation floor probability is negative !" << endl;
          exit(-1);
      }
      else if(floor > 1)
      {
          cerr << "Error : Observation floor probability > 1 " << endl;
          exit(-1);
      }
      ob_log_prob_floor = safe_log(floor);
    }

    if (al.present("-ob_floor2"))
    {
      floor = al.fval("-ob_floor2");
      if(floor < 0)
      {
          cerr << "Error : Observation2 floor probability is negative !" << endl;
          exit(-1);
      }
      else if(floor > 1)
      {
          cerr << "Error : Observation2 floor probability > 1 " << endl;
          exit(-1);
      }
      ob_log_prob_floor2 = safe_log(floor);
    }
    

    if (al.present("-ob_type"))
    {
      if(al.val("-ob_type") == "logs")
          take_logs = false;
      else if(al.val("-ob_type") == "probs")
          take_logs = true;
      else
      {
          cerr << "\"" << al.val("-ob_type") 
            << "\" is not a valid ob_type : try \"logs\" or \"probs\"" << endl;
          exit(-1);
      }
    }

    if(do_search(wstream))
      print_results(wstream);
    else
      cerr << "No path could be found." << endl;

    return 0;
}

static void print_results(EST_Relation &wstream)
{
    EST_Item *s;
    float pscore;
    EST_String predict;
    FILE *fd;

    if (out_file == "-")
      fd = stdout;
    else if ((fd = fopen(out_file,"wb")) == NULL)
    {
      cerr << "can't open \"" << out_file << "\" for output" << endl;
      exit(-1);
    }

    for (s=wstream.head(); s != 0 ; s=next(s))
    {
      predict = s->f("best").string();
      pscore = s->f("best_score");
      fprintf(fd,"%s %f\n",(const char *)predict,pscore);
    }

    if (out_file != "")
      fclose(fd);

}

static bool do_search(EST_Relation &wstream)
{
    // Apply Ngram to matrix of probs 
    int states;

    states = ngram.num_states();
    EST_Viterbi_Decoder vc(vit_candlist,vit_npath,states);

    vc.initialise(&wstream);

    if((beam > 0) || (ob_beam > 0))
      vc.set_pruning_parameters(beam,ob_beam);

    if(trace_on)
    {
      vc.turn_on_trace();
      cerr << "Starting Viterbi search..." << endl;
    }

    vc.search();

    return vc.result("best");  // adds fields to w with best values 

}

static void load_wstream(const EST_String &filename,
                   const EST_String &vfile, 
                   EST_Relation &w,
                   EST_Track &obs)
{
    // Load in vocab and probs into Stream (this isn't general)
    EST_String word, pos;
    int i=-1;

    if(vocab.empty())
      load_vocab(vfile);

    if (obs.load(filename,0.10) != 0)
    {
      cerr << "can't find observations file \"" << filename << "\"" << endl;
      exit(-1);
    }

    if (vocab.length() != obs.num_channels())
    {
      cerr << "Number in vocab (" << vocab.length() << 
          ") not equal to observation's width (" <<
            obs.num_channels() << ")" << endl;
      exit(-1);
    }
      
    if(w.empty())
      for (i=0; i < obs.num_frames(); i++)
          add_word(w,itoString(i),i);
}


static void load_given(const EST_String &filename,
                   const int ngram_order)
{

    EST_String word, pos;
    EST_Litem *p;
    int i,j;

    if (load_TList_of_StrVector(given,filename,ngram_order-1) != 0)
    {
      cerr << "can't load given file \"" << filename << "\"" << endl;
      exit(-1);
    }

    // set max history
    for (p = given.head(); p; p = next(p))
    {
      for(i=0;i<given(p).length();i++)
          if(     is_a_special( given(p)(i), j) && (-j > max_history))
            max_history = -j;
      
    }
    
}

static void load_vocab(const EST_String &vfile)
{
    // Load vocabulary (strings)
    EST_TokenStream ts;

    if (ts.open(vfile) == -1)
    {
      cerr << "can't find vocab file \"" << vfile << "\"" << endl;
      exit(-1);
    }

    while (!ts.eof())
      if (ts.peek() != "")
          vocab.append(ts.get().string());

    ts.close();
}

static void add_word(EST_Relation &w, const EST_String &word, int pos)
{
    EST_Item *item = w.append();
    
    item->set_name(word);
    item->set("pos",pos);
} 

static EST_VTCandidate *vit_candlist(EST_Item *s,EST_Features &f)
{
    // Return a list of new candidates from this point 
    double prob=1.0,prob2=1.0;
    int i;
    EST_Litem *p;
    int observe;
    EST_VTCandidate *all_c = 0;
    EST_VTCandidate *c;
    (void)f;

    observe = s->f("pos");  // index for observations TRACK
    for (i=0,p=vocab.head(); i < observations.num_channels(); i++,p=next(p))
    {
      c = new EST_VTCandidate;
      c->name = vocab(p);  // to be more efficient this could be the index
      prob = observations.a(observe,i);
      if(num_obs == 2)
          prob2 = observations2.a(observe,i);

      if(take_logs)
      {
          prob = safe_log10(prob);
          if (prob < ob_log_prob_floor)
            prob = ob_log_prob_floor;

          if(num_obs == 2)
          {
            prob2 = safe_log10(prob2);
            if (prob2 < ob_log_prob_floor2)
                prob2 = ob_log_prob_floor2;
          }
      }
      else // already in logs
      {
          if (prob < ob_log_prob_floor)
            prob = ob_log_prob_floor;
          if ((num_obs == 2) && (prob2 < ob_log_prob_floor2))
            prob2 = ob_log_prob_floor2;
      }

      prob *= ob_scale;
      prob2 *= ob_scale2;

      if(num_obs == 2)
          c->score = prob + prob2;
      else
          c->score = prob;

      c->next = all_c;
      c->s = s;
      all_c = c;
    }

    if(n_beam > 0)
    {
      // N.B. this might be very time-consuming
      top_n_candidates(all_c);
    }

    return all_c;
}

static EST_VTPath *vit_npath(EST_VTPath *p,EST_VTCandidate *c,
                       EST_Features &f)
{
    // Build a (potential) new path link from this previous path and 
    // This candidate 
    EST_VTPath *np = new EST_VTPath;
    double lprob,prob;
    EST_String prev,ttt;
    (void)f;

    np->c = c;
    np->from = p;

    // are we using extra info ?
    if(using_given)
      // time of candidate is
      // c->s->f("pos");
      prob = find_extra_gram_prob(np,&np->state,c->s->f("pos"));
    else
      prob = find_gram_prob(np,&np->state);

    lprob = safe_log10(prob);
    if (lprob < lm_log_prob_floor)
      lprob =     lm_log_prob_floor;

    lprob *= lm_scale;

    np->f.set("lscore",(c->score+lprob)); // simonk : changed prob to lprob
    if (p==0)
      np->score = (c->score+lprob);
    else
      np->score = (c->score+lprob) + p->score;

    return np;
}

static double find_gram_prob(EST_VTPath *p,int *state)
{
    // Look up transition probability from *state for name.
    // Return probability and update state
    double prob=0.0,nprob;
    int i,f=FALSE;
    EST_VTPath *pp;
    
    EST_StrVector window(ngram.order());
    for (pp=p->from,i=ngram.order()-2; i >= 0; i--)
    {
      if (pp != 0)
      {
          window[i] = pp->c->name.string();
          pp = pp->from;
      }
      else if (f)
          window[i] = ppstring;
      else
      {
          window[i] = pstring;
          f = TRUE;
      }
    }
    window[ngram.order()-1] = p->c->name.string();
    const EST_DiscreteProbDistribution &pd = ngram.prob_dist(window);
    if (pd.samples() == 0)
      prob = 0;
    else
      prob = (double)pd.probability(p->c->name.string());
    
    for (i=0; i < ngram.order()-1; i++)
      window[i] = window(i+1);
    ngram.predict(window,&nprob,state);

    return prob;
}


static double find_extra_gram_prob(EST_VTPath *p,int *state,int time)
{

    int i;
    double prob=0.0,nprob;
    EST_StrVector window(ngram.order());
    EST_StrVector history(max_history);

    get_history(history,p);

    fill_window(window,history,p,time);

    /*
    cerr << "Looking up ngram ";
    for(i=0;i<window.num_points();i++)
      cerr << window(i) << " ";
    cerr << endl;
    */

    const EST_DiscreteProbDistribution &pd = ngram.prob_dist(window);
    if (pd.samples() == 0)
      prob = 0;
    else
      prob = (double)pd.probability(p->c->name.string());

    // shift history, adding latest item at 'end' (0)
    if(max_history>0)
    {
      for(i=history.length()-1;i>0;i--)
          history[i] = history(i-1);
      history[0] = p->c->name.string();
    }

    fill_window(window,history,p,time+1);
    ngram.predict(window,&nprob,state);

    //cerr << endl << endl;

    return prob;

}

static void get_history(EST_StrVector &history, EST_VTPath *p)
{

    EST_VTPath *pp;
    int i,f=FALSE;
    for (pp=p->from,i=0; i < history.length(); i++)
    {
      
      if (pp != 0)
      {
          history[i] = pp->c->name.string();
          pp = pp->from;
      }
      else if (f)
          history[i] = ppstring;
      else
      {
          history[i] = pstring;
          f = TRUE;
      }
    }

}

static void fill_window(EST_StrVector &window,EST_StrVector &history,
                  EST_VTPath *p,const int time)
{
    // Look up transition probability from *state for name.
    // Return probability and update state
    int i,j;
    EST_String s;

    // can we even do it?
    if( time >= given.length() )
      return;

    // format should be run-time defined, but try this for now
    // first n-1 things in window come from 'given'
    // last one is predictee

    // also want vocab and grammar mismatch allowed !!!!!!

    // predictee
    window[ngram.order()-1] = p->c->name.string();

    // given info for this time
    EST_StrVector *this_g = &(given.nth(time)); // inefficient to count down a list


    for(i=0;i<ngram.order()-1;i++)
    {

      if( is_a_special( (*this_g)(i), j))
          window[i] = history(-1-j); // j=-1 -> (0)   j=-2 -> (1)   etc.
      else
          window[i] = (*this_g)(i);
    }
}



static int is_a_special(const EST_String &s, int &val)
{

    // special is "<int>"

    EST_String tmp;
    if(s.contains("<") && s.contains(">"))
    {
      tmp = s.after("<");
      tmp = tmp.before(">");
      val = atoi(tmp);
      //cerr << "special " << tmp << "=" << val << endl;
      return TRUE;
    }
    return FALSE;
}

static void top_n_candidates(EST_VTCandidate* &all_c)
{
    // keep the n most likely candidates
    // avoiding a full sort of the (potentially long) list

    EST_VTCandidate *top_c=NULL,*p,*q,*this_best, *prev_to_best;
    int i;
    if(n_beam < 1)
      return; // do nothing

    // here we assume big is always good
    //if(!big_is_good)
    //score_multiplier = -1;

    for(i=0;i<n_beam;i++)
    {

      // head of the list is best guess
      this_best=all_c;
      prev_to_best=NULL;

      // find best candidate in all_c
      q=NULL;;
      for(p=all_c;p!= NULL;q=p,p=p->next)
      {
          //cerr << "item : " << p->score << endl;
          if(p->score > this_best->score)
          {
            this_best = p;
            prev_to_best=q;
          }
      }
      
      if(this_best == NULL)
          break; // give up now - must have run out of candidates

      // move best candidate over to new list
      if(prev_to_best == NULL)
          // best was head of list
          all_c = this_best->next;
      else
          // best was not head of all_c
          prev_to_best->next = this_best->next;

          this_best->next = top_c;
          top_c = this_best;
    }

    delete all_c;
    all_c = top_c;

/*
    cerr << "Here are the top " << n_beam << " candidates" << endl;
    for(p=all_c;p != NULL;p=p->next)
      cerr << p->score << endl;
*/
}


/**@name Examples

Example 'given' file (items f and g are in the vocabulary), the ngram
is a 4-gram.

<para>
<screen>
<-2> g g
<-1> g f
<-1> f g
<-2> g g
<-3> g g
<-1> g f
</screen>
</para>

*/
//@{
//@}



//@}

Generated by  Doxygen 1.6.0   Back to index