Logo Search packages:      
Sourcecode: speech-tools version File versions  Download package

EST_Ngrammar.cc

/*************************************************************************/
/*                                                                       */
/*                Centre for Speech Technology Research                  */
/*                     University of Edinburgh, UK                       */
/*                         Copyright (c) 1996                            */
/*                        All Rights Reserved.                           */
/*                                                                       */
/*  Permission is hereby granted, free of charge, to use and distribute  */
/*  this software and its documentation without restriction, including   */
/*  without limitation the rights to use, copy, modify, merge, publish,  */
/*  distribute, sublicense, and/or sell copies of this work, and to      */
/*  permit persons to whom this work is furnished to do so, subject to   */
/*  the following conditions:                                            */
/*   1. The code must retain the above copyright notice, this list of    */
/*      conditions and the following disclaimer.                         */
/*   2. Any modifications must be clearly marked as such.                */
/*   3. Original authors' names are not deleted.                         */
/*   4. The authors' names are not used to endorse or promote products   */
/*      derived from this software without specific prior written        */
/*      permission.                                                      */
/*                                                                       */
/*  THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK        */
/*  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING      */
/*  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT   */
/*  SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE     */
/*  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES    */
/*  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   */
/*  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,          */
/*  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF       */
/*  THIS SOFTWARE.                                                       */
/*                                                                       */
/*************************************************************************/
/*                     Author :  Simon King & Alan W Black               */
/*                     Date   :  February 1997                           */
/*-----------------------------------------------------------------------*/
/*                                                                       */
/* An EST_Ngram class for building and manipulating bigrams trigrams etc */
/*                                                                       */
/*=======================================================================*/
#include <iostream.h>
#include <fstream.h>
#include <string.h>
#include <math.h>
#include <limits.h>
#include <float.h>
#include "EST_Ngrammar.h"
#include "EST_Pathname.h"
#include "EST_Token.h"
#include "EST_io_aux.h"

const EST_DiscreteProbDistribution PSTnullProbDistribution;
static EST_String NOVOCAB("NOVOCAB");

// **********************************************************************

EST_NgrammarState::EST_NgrammarState(const EST_NgrammarState &s)
{
    clear();
    init(s.id(),s.pdf_const());
}

EST_NgrammarState::EST_NgrammarState(const EST_NgrammarState *const s)
{
    clear();
    init(s->id(),s->pdf_const()); // copy
}

EST_NgrammarState::~EST_NgrammarState()
{
    clear();
}

void EST_NgrammarState::clear()
{
    p_id = -1;
    p_pdf.clear();
}

void EST_NgrammarState::init()
{
    p_id=-1;
    p_pdf.init();
}

void EST_NgrammarState::init(int id,EST_Discrete *d)
{
    p_id = id;
    p_pdf.init(d);
}


void EST_NgrammarState::init(int id, const EST_DiscreteProbDistribution &pdf)
{
    p_id = id;
    p_pdf = pdf; // copy it
}


ostream& operator<<(ostream& s, const EST_NgrammarState &a)
{
    s << "(" << a.id() << ": " << a.pdf_const() << " )";
    return s;
}

// **********************************************************************

EST_BackoffNgrammarState::~EST_BackoffNgrammarState()
{
    p_pdf.clear();
    children.clear();
}


void EST_BackoffNgrammarState::clear()
{
    backoff_weight=0;
    p_pdf.clear();
}

void EST_BackoffNgrammarState::init()
{
    backoff_weight=0;
    p_pdf.init();
}

void EST_BackoffNgrammarState::init(const EST_Discrete *d,int level)
{
    backoff_weight=0;
    p_level = level;
    p_pdf.init(d);
}

void EST_BackoffNgrammarState::init(const EST_DiscreteProbDistribution &pdf, int level)
{
    backoff_weight=0;
    p_level = level;
    p_pdf = pdf; // copy it
}

bool EST_BackoffNgrammarState::accumulate(const EST_StrVector &words,
                                const double count)
{

//    int i;
//    cerr << "accumulate ";
//    for(i=0;i<words.n();i++)
//    {
//    if(words.n()-1-p_level == i)
//        cerr <<"*";
//    cerr << words(i);
//    if(words.n()-1-p_level == i)
//        cerr <<"*";
//    }
//    cerr << endl;

    // update pdf at this node
    p_pdf.cumulate(words(words.n()-1-p_level),count);
    
    // then go down
    if (words.n()-1-p_level > 0) // not at bottom of tree
    {

      EST_BackoffNgrammarState *s;
      
      // have we got the child
      s = get_child(words(words.n()-1-p_level));
      if (s==NULL)
          // have to extend the tree
          s = add_child(p_pdf.get_discrete(),words);
      return s->accumulate(words,count);

    }
    else
    {
      return true;
    }
}

bool EST_BackoffNgrammarState::accumulate(const EST_IVector &words,
                                const double count)
{

//    int i;
//    cerr << "accumulate level " << p_level << " : ";
//    for(i=0;i<words.n();i++)
//    {
//    if(words.n()-1-p_level == i)
//        cerr <<"*";
//    cerr << p_pdf.item_name(words(i));
//    if(words.n()-1-p_level == i)
//        cerr <<"*";
//    cerr << " ";
//    }
//    cerr << endl;

    // update pdf at this node
    p_pdf.cumulate(words(words.n()-1-p_level),count);
    
    //cerr << *this << endl;
    //cerr << endl;

    // then go down
    if (words.n()-1-p_level > 0) // not at bottom of tree
    {

      EST_BackoffNgrammarState *s;
      
      // have we got the child
      s = get_child(words(words.n()-1-p_level));
      if (s==NULL)
          // have to extend the tree
          s = add_child(p_pdf.get_discrete(),words);

      // get pointer again in case we built more than one level
      s = get_child(words(words.n()-1-p_level));
      if (s==NULL)
      {
          cerr << "Failed to extend tree - unknown reason !" << endl;
          return false;
      }
      return s->accumulate(words,count);
    }
    else
    {
      return true;
    }
}


EST_BackoffNgrammarState*
EST_BackoffNgrammarState::add_child(const EST_Discrete *d,
                            const EST_StrVector &words)
{
    EST_BackoffNgrammarState *s;
    
    if (words.n()-1-p_level > 0) // more history still to go
    {
      // see if we can go down the tree
      s = get_child(words(words.n()-1-p_level));
      if (s != NULL)
          return s->add_child(d,words);
      else
      { 
          // construct tree as we go
          EST_BackoffNgrammarState *new_child = new EST_BackoffNgrammarState;
          new_child->init(d,p_level+1);
          children.add(words(words.n()-1-p_level), (void*)new_child);
          return new_child->add_child(d,words);
      }
    }
    else
    {
      // this is the node we are trying to add !
      return this;
    }
}



EST_BackoffNgrammarState*
EST_BackoffNgrammarState::add_child(const EST_Discrete *d,
                            const EST_IVector &words)
{
    EST_BackoffNgrammarState *s;
    
    if (words.n()-1-p_level > 0) // more history still to go
    {
      // see if we can go down the tree
      s = get_child(words(words.n()-1-p_level));
      if (s != NULL)
          return s->add_child(d,words);
      else
      { 
          // construct tree as we go
          EST_BackoffNgrammarState *new_child = new EST_BackoffNgrammarState;
          new_child->init(d,p_level+1);
          children.add(p_pdf.get_discrete()->name(words(words.n()-1-p_level)), (void*)new_child);
          return new_child->add_child(d,words);
      }
    }
    else
    {
      // this is the node we are trying to add !
      return this;
    }
}

void EST_BackoffNgrammarState::remove_child(EST_BackoffNgrammarState *child,
                                  const EST_String &name)
{

    child->zap();
    // can't remove from StringTrie, but can set pointer to NULL
    children.add(name,NULL);
    delete child;
}

void EST_BackoffNgrammarState::print_freqs(ostream &os, 
                                 const int order, 
                                 EST_String followers)
{
    // not right - just print out, then recurse through children
    // change to use 'backoff_traverse'
    
    int k;
    double freq;
    EST_String name;
    for (k=p_pdf.item_start();
       !p_pdf.item_end(k);
       k = p_pdf.item_next(k))
    {
      p_pdf.item_freq(k,name,freq);
      EST_BackoffNgrammarState *s = ((EST_BackoffNgrammarState*)(children.lookup(name)));
      if (p_level==order-1)
      {
          if(freq>0)
            os << name << " " << followers
                << ": " << freq << endl;
      }
      else if (s!=NULL)
          s->print_freqs(os,order,name+" "+followers);
      
    }
}

bool EST_BackoffNgrammarState::ngram_exists(const EST_StrVector &words,
                                  const double threshold) const
{
    const EST_BackoffNgrammarState *s;
    s = get_state(words);
    if (s != NULL)
    {
      // return true for unigrams (state level 0)
      // even if freq < threshold
      return (bool)((s->level()==0) || 
                  ( s->frequency(words(0)) > threshold ));
    }
    else
      return false;
}

const EST_BackoffNgrammarState *const
EST_BackoffNgrammarState::get_state(const EST_StrVector &words) const
{
    EST_BackoffNgrammarState *s;
    if (words.n()-1-p_level > 0) // more history still to go
      {
        s = get_child(words(words.n()-1-p_level));
        if (s != NULL)
        {
            //cerr << "traversing from " << *this << endl;
            //cerr << " to " << *s << endl << endl;
            return s->get_state(words);
        }
        else
        {
            //cerr << "got NULL" << endl;
            return NULL;
        }
      }
    else
    {
      //cerr << "got " << *this << endl;
      return this;
    }
}

void EST_BackoffNgrammarState::zap()
{

    // recursively delete this state and all its children
    int k;
    double freq;
    EST_String name;
    for (k=p_pdf.item_start();
       !p_pdf.item_end(k);
       k = p_pdf.item_next(k))
    {
      p_pdf.item_freq(k,name,freq);
      EST_BackoffNgrammarState *child = get_child(name);
      if (child!=NULL) 
          remove_child(child,name);
    }

    children.clear();
    p_pdf.clear();

}


const double EST_BackoffNgrammarState::get_backoff_weight(const EST_StrVector &words) const
{
    EST_BackoffNgrammarState *s;
    if (words.n()-1-p_level >= 0)
    {
      s = get_child(words(words.n()-1-p_level));
      if (s != NULL)
          return s->get_backoff_weight(words);
      else
      {
          // if there is no node, the weight would have been 1 anyway
          //cerr << "Couldn't get weight for " << words << endl;
          return 1;           // default
      }
    }
    
    else
    {
      // reached node
      
/*
      cerr << "got bw for ";
      for(int i=0;i<words.n();i++)
          cerr << words(i) << " ";
      cerr << " at level " << p_level << " = " << backoff_weight << endl;
*/    
      return backoff_weight;
    }
}


bool EST_BackoffNgrammarState::set_backoff_weight(const EST_StrVector &words, const double w)
{
    EST_BackoffNgrammarState *s;
    if (words.n()-1-p_level >= 0)
    {
      s = get_child(words(words.n()-1-p_level));
      if (s != NULL)
          return s->set_backoff_weight(words,w);
      else
      {
          // we can't set the weight because the node
          // doesn't exist - this is an error if the weight
          // is not 1
          if (w != 1)
          {
            cerr << "Couldn't set weight for " << words 
                << " to " << w << endl;
            return false;
          }
          else
            return true;      // a weight of 1 does not need storing
      }
    }
    else
    {
      // reached node
      backoff_weight = w;
      return true;
    }
}

void EST_BackoffNgrammarState::frequency_of_frequencies(EST_DVector &ff)
{
    int k,max=ff.n();
    double freq;
    EST_String name;
    for (k=p_pdf.item_start();
       !p_pdf.item_end(k);
       k = p_pdf.item_next(k))
    {
      p_pdf.item_freq(k,name,freq);
      if(freq < max)
          ff[(int)(freq+0.5)] += 1;
    }
}

ostream& operator<<(ostream& s, const EST_BackoffNgrammarState &a)
{
    s << "(backoff level:" << a.p_level
      << " weight:" << a.backoff_weight << " " << a.pdf_const() << " )";
    
    return s;
}

// **********************************************************************

void EST_Ngrammar::default_values()
{
    p_representation = EST_Ngrammar::sparse;
    p_entry_type = EST_Ngrammar::frequencies; // by default
    sparse_representation.clear();
    allow_oov=false;
    p_num_samples = 0;
    p_num_states = 0;
    p_states = 0;
    vocab = 0;
    pred_vocab = 0;
    backoff_threshold = 1.0;
    backoff_unigram_floor_freq = 0.0;
}

EST_Ngrammar::~EST_Ngrammar()
{
    delete [] p_states;
}

void EST_Ngrammar::clear()
{
    // to do
}

bool EST_Ngrammar::init(int o, EST_Ngrammar::representation_t r, 
                  const EST_StrList &wordlist)
{
    return (bool)(init_vocab(wordlist) && p_init(o,r));
}

bool EST_Ngrammar::init(int o, EST_Ngrammar::representation_t r, 
                  const EST_StrList &wordlist,
                  const EST_StrList &predlist)
{
    return (bool)(init_vocab(wordlist,predlist) && p_init(o,r));
}

bool EST_Ngrammar::init(int o, EST_Ngrammar::representation_t r, 
                  EST_Discrete &v)
{
    vocab = &v;
    pred_vocab = vocab;
    vocab_pdf.init(pred_vocab);
    return p_init(o,r);
}

bool EST_Ngrammar::init(int o, EST_Ngrammar::representation_t r, 
                  EST_Discrete &v, EST_Discrete &pv)
{
    vocab = &v;
    pred_vocab = &pv;
    vocab_pdf.init(pred_vocab);
    return p_init(o,r);
}

bool EST_Ngrammar::p_init(int o, EST_Ngrammar::representation_t r)
{
    if (o <= 0)
    {
      cerr << "EST_Ngrammar order must be > 0" << endl;
      return false;
    }
    
    p_order = o;
    p_representation = r;
    p_number_of_sentences = 0;
    
    switch(p_representation)
    {
      
    case EST_Ngrammar::sparse:
      sparse_representation.init(p_order);
      return true;
      break;
      
    case EST_Ngrammar::dense:
      return init_dense_representation();
      break;
      
    case EST_Ngrammar::backoff:
      return init_backoff_representation();
      break;
      
    default:
      cerr << "Unknown internal representation requested for EST_Ngrammar"
          << endl;
      return false;
      break;
    }
}

bool EST_Ngrammar::init_dense_representation()
{
    // allocate a flattened  N-dimensional matrix of states
    int i;
    
    if (vocab->length() <= 0)
    {
      cerr << "EST_Ngrammar: dense_representation requires explicit vocab"
          << endl;
      return false;
    }
    
    p_num_states = (int)pow(vocab->length(),p_order-1);
    p_states = new EST_NgrammarState[p_num_states];
    for (i=0; i < p_num_states; i++)
      p_states[i].init(i,pred_vocab);
    
    return true;
}

bool EST_Ngrammar::init_sparse_representation()
{
    
    if (vocab->length() <= 0)
    {
      cerr << "EST_Ngrammar: dense_representation requires explicit vocab"
          << endl;
      return false;
    }
    
    p_num_states = (int)pow(vocab->length(),p_order-1);
    p_states = new EST_NgrammarState[p_num_states];
    
    return (bool)(p_states != NULL);
}

bool EST_Ngrammar::init_backoff_representation()
{
    
    // nothing much to do
    backoff_representation = new EST_BackoffNgrammarState;
    backoff_representation->init(vocab,0);
    return true;
}


const EST_StrVector &EST_Ngrammar::make_ngram_from_index(const int index) const
{
    int i,ind=index;
    EST_StrVector *ngram = new EST_StrVector;
    ngram->resize(p_order-1); // exclude last word
    
    // matches 'find_dense_state_index' so first word is most significant
    // also, cannot fill in last word
    
    for(i=p_order-2;i>=0;i--)
    {
#if defined(sun) && ! defined(__svr4__)
/* SunOS */
      int rem = ind%vocab->length();
      int quot = ind/vocab->length();
      (*ngram)[i] = wordlist_index(rem);
      ind = quot;
#else
      div_t d = div(ind,vocab->length());
      (*ngram)[i] = wordlist_index(d.rem);
      ind = d.quot;
#endif
    }
    
    return *ngram;
}




bool EST_Ngrammar::init_vocab(const EST_StrList &word_list)
{
    vocab = new EST_Discrete();
    if(!vocab->init(word_list))
      return false;
    
    pred_vocab = vocab; // same thing in this case
    vocab_pdf.init(pred_vocab);
    
    return (bool)(vocab != NULL);
}

bool EST_Ngrammar::init_vocab(const EST_StrList &word_list,
                        const EST_StrList &pred_list)
{
    vocab = new EST_Discrete();
    if(!vocab->init(word_list))
      return false;
    pred_vocab = new EST_Discrete();
    if(!pred_vocab->init(pred_list))
      return false;
    vocab_pdf.init(pred_vocab);
    
    return (bool)(vocab != NULL);
}

bool EST_Ngrammar::check_vocab(const EST_StrList &word_list)
{
    EST_Discrete *comp_vocab = new EST_Discrete();
    
    if(!comp_vocab->init(word_list))
    {
      delete comp_vocab;
      return false;
    }
    
    if(*vocab != *comp_vocab)
    {
      delete comp_vocab;
      return false;
    }
    
    delete comp_vocab;
    return true;
}

const EST_String & EST_Ngrammar::wordlist_index(int i) const
{
    return vocab->name(i);
}

int EST_Ngrammar::predlist_index(const EST_String &word) const
{
    
    if (word=="") // can't lookup
      return -1;
    
    int i;
    i = pred_vocab->index(word);
    if(i >= 0 )
      return i;
    
    cerr << "Word \"" << word << "\" is not in the predictee word list" << endl;
    
    if (allow_oov)
    {
      i = pred_vocab->index(OOV_MARKER);
      if(i >= 0)
          return i;
      
      cerr << "Even " << OOV_MARKER << " is not in the predictee word list !" << endl;
    }
    
    return -1;
}


const EST_String & EST_Ngrammar::predlist_index(int i) const
{
    return pred_vocab->name(i);
}

int EST_Ngrammar::wordlist_index(const EST_String &word, const bool report) const
{
    
    if (word=="") // can't lookup
      return -1;
    
    int i;
    i = vocab->index(word);
    if(i >= 0 )
      return i;
    
    if(report)
      cerr << "Word \"" << word << "\" is not in the word list" << endl;
    
    if (allow_oov)
    {
      i = vocab->index(OOV_MARKER);
      if(i >= 0)
          return i;
      if(report)
          cerr << "Even " << OOV_MARKER << " is not in the word list !" << endl;
    }
    
    return -1;
}

bool EST_Ngrammar::build(const EST_StrList &filenames,
                   const EST_String &prev,
                   const EST_String &prev_prev,
                   const EST_String &last,
                   const EST_String &input_format,
                   const EST_String &oov_mode,
                   const int mincount,
                   const int maxcount)
{
    
    p_sentence_start_marker = prev;
    p_sentence_end_marker = last;


    // when backing off, safest modes  ...
    if( (p_representation == EST_Ngrammar::backoff) &&
       (oov_mode != "skip_file") &&
       (oov_mode != "skip_sentence"))
      cerr << "Warning : building a backoff grammar" << endl
          << "          with oov_mode '" << oov_mode 
            << "' is not recommended !" << endl;

    if( (oov_mode != "skip_ngram") &&
       (oov_mode != "skip_sentence") &&
       (oov_mode != "skip_file") &&
       (oov_mode != "use_oov_marker")  )
    {
      cerr << "Unknown oov_mode '" << oov_mode << "'" << endl;
      return false;
    }

    if( (oov_mode == "skip_sentence") &&
       (input_format == "ngram_per_line"))
    {
      cerr << "Sorry, with input format 'ngram_per_line' you cannot " << endl
          << " select oov_mode 'skip_sentence'" << endl;
      return false;
    }

    if(oov_mode == "use_oov_marker")
      allow_oov = true;
    else
      allow_oov = false;

    bool skip_this;
    EST_String new_filename;
    EST_Litem *p;
    for (p = filenames.head(); p; p = next(p))
    {
      cerr << "Building from " << filenames(p) << endl;

      skip_this=false;
      if( ((oov_mode == "skip_sentence") &&
           (input_format == "sentence_per_file")) ||
         (oov_mode == "skip_file")  )
          skip_this = oov_preprocess(filenames(p),new_filename,
                               "skip if found");

      else if( ((oov_mode == "skip_sentence") &&
              (input_format == "sentence_per_line")) ||
            ((oov_mode == "skip_ngram") &&
             (input_format == "ngram_per_line"))  )
          oov_preprocess(filenames(p),new_filename,"eliminate lines");

      else
          new_filename = filenames(p);


      if(skip_this)
      {
          cerr << "Skipping " << filenames(p) 
            << " (out of vocabulary words found)" << endl;
      }
      else
      {
          
          switch(p_representation)
          {
          case EST_Ngrammar::sparse:
          {
            if (input_format != "")
            {
                cerr << "Can't build sparse ngram from '" << input_format;
                cerr << "' data" << endl;
                return false;
            }
            else if (!build_sparse(new_filename,prev,prev_prev,last))
                return false;
          }
          break;
          
      case EST_Ngrammar::dense:
          if (!build_ngram(new_filename,prev,prev_prev,last,input_format))
            return false;
          break;
          
      case EST_Ngrammar::backoff:
          if (!build_ngram(new_filename,prev,prev_prev,last,input_format))
            return false;
          break;
          
      default:
          cerr << "Unknown internal representation set for EST_Ngrammar"
            << endl;
          return false;
          break;
      }
      }

      if((new_filename != filenames(p)) && 
         (new_filename != "") &&
         !delete_file(new_filename)  )
      {
          cerr << "Warning : couldn't remove temporary file : "
            << new_filename << endl;
      }
      
    } // loop round files

    if (p_representation == EST_Ngrammar::backoff)
      return compute_backoff_weights(mincount,maxcount);
    
    return true;
}

void EST_Ngrammar::accumulate(const EST_StrVector &words,
                        const double count)
{
    if (words.n() < p_order)
      cerr << "EST_Ngrammar::accumulate - window is too small" << endl;
    else
    {
      p_num_samples++;
      const EST_String &w = lastword(words);
      vocab_pdf.cumulate(w,count);
      
      switch(p_representation)
      {
      case EST_Ngrammar::dense:
      case EST_Ngrammar::sparse:
          find_state(words).cumulate(w,count);
          break;
          
      case EST_Ngrammar::backoff:
          backoff_representation->accumulate(words,count);
          break;
          
      default:
          cerr << "EST_Ngrammar::accumulate : invalid representation !"
            << endl;
          break;
      }
    }             // switch
}

void EST_Ngrammar::accumulate(const EST_IVector &words,
                        const double count)
{
    
    /*
       int i;
       for(i=0;i<words.n();i++)
       {
       cerr << vocab_pdf.item_name(words(i));
       cerr << " ";
       }
       cerr << endl;
       */
    
    if (words.n() < p_order)
      cerr << "EST_Ngrammar::accumulate - window is too small" << endl;
    else
    {
      p_num_samples++;
      vocab_pdf.cumulate(words(p_order-1),count);
      
      switch(p_representation)
      {
          
      case EST_Ngrammar::dense:
      case EST_Ngrammar::sparse:
          find_state(words).cumulate(words(p_order-1),count);
          break;
          
      case EST_Ngrammar::backoff:
          backoff_representation->accumulate(words,count);
          break;
          
      default:
          cerr << "EST_Ngrammar::accumulate : invalid representation !"
            << endl;
          break;
      }                 // switch
    }
}


bool EST_Ngrammar::ngram_exists(const EST_StrVector &words) const
{
    switch(p_representation)
    {
    case EST_Ngrammar::sparse:
      return false;
      break;
      
    case EST_Ngrammar::dense:
      return true;      // always exists !
      break;
      
    case EST_Ngrammar::backoff:
    {
      if(words.n()==1)
          return backoff_representation->ngram_exists(words,0);
      else
          return backoff_representation->ngram_exists(words,backoff_threshold);
    }
      break;
      
    default:
      cerr << "ngram_exists: unknown ngrammar representation" << endl;
      break;
    }
    return false;
}

bool EST_Ngrammar::ngram_exists(const EST_StrVector &words, const double threshold) const
{
    if (p_representation != EST_Ngrammar::backoff)
    {
      cerr << "Not a backoff grammar !" << endl;
      return false;
    }
    
    return backoff_representation->ngram_exists(words,threshold);
    
}


const double EST_Ngrammar::get_backoff_weight(const EST_StrVector &words) const
{
    if(p_representation == EST_Ngrammar::backoff)
      return backoff_representation->get_backoff_weight(words);
    else
    {
      cerr << "Can't get backoff weight - not a backed off ngrammar !" << endl;
      return 0;
    }
}

bool EST_Ngrammar::set_backoff_weight(const EST_StrVector &words, const double w)
{
    if(p_representation == EST_Ngrammar::backoff)
      return backoff_representation->set_backoff_weight(words,w);
    else
    {
      cerr << "Can't set backoff weight - not a backed off ngrammar !" << endl;
      return false;
    }
}


bool EST_Ngrammar::build_sparse(const EST_String &filename,
                        const EST_String &prev,
                        const EST_String &prev_prev,
                        const EST_String &last)
{
    sparse_representation.build(filename,prev,prev_prev,last);
    return true;
}


void EST_Ngrammar::fill_window_start(EST_IVector &window, 
                             const EST_String &prev,
                             const EST_String &prev_prev) const
{
    int i;
    
    for (i=0; i<window.n()-1; i++)
      window[i] = wordlist_index(prev_prev);
    window[i++] = wordlist_index(prev);
}

void EST_Ngrammar::fill_window_start(EST_StrVector &window, 
                             const EST_String &prev,
                             const EST_String &prev_prev) const
{
    int i;
    
    for (i=0; i<window.n()-1; i++)
      window[i] = prev_prev;
    window[i++] = prev;
}

bool EST_Ngrammar::oov_preprocess(const EST_String &filename,
                          EST_String &new_filename,
                          const EST_String &what)
{
    ostream *ost=0;
    EST_TokenStream ts;
    new_filename="";
    int bad_line_count=0;
    int good_line_count=0;

    // if filename is stdin we have to make a temporary file
    // if we are eliminating lines we also need a temporary file

    // what = "skip if found" | "eliminate lines"

    bool write_out = false;
    if( (what == "eliminate lines") || (filename == "-") )
      write_out = true;

    // open the original files for reading
    if (filename == "-")
    {
      if( ts.open(stdin, FALSE) == -1)
      {
          cerr << "EST_Ngrammar:: failed to open stdin";
          cerr << " for reading" << endl;
          return false;
      }
    }
    else if (ts.open(filename) == -1){
      cerr << "EST_Ngrammar: failed to open file \"" << filename 
          << "\" for reading" << endl;
      return false; // hmmm ?
    }
    
    // open the output file if necessary
    if(write_out)
    {
      new_filename = make_tmp_filename();
      ost = new ofstream(new_filename);

      if(!(*ost))
      {
          cerr << "Ngrammar: couldn't create temporary file \"" 
            << new_filename << "\"" << endl;
          new_filename="";
          return false;
      }
    }
    else
      new_filename = filename;

    EST_String s,this_line;
    bool bad_line=false;
    while (!ts.eof())
    {
      s=ts.get().string();
      
      if (!bad_line && (s != ""))
      {
          if(wordlist_index(s,false) < 0)
          {

            if(what == "eliminate lines")
            {
                bad_line=true;
            }
            else // skip file
            {
                if(write_out) 
                {
                  // input was stdin, but we are now discarding it
                  delete ost;
                  if(!delete_file(new_filename))
                      cerr << "Warning : couldn't delete temporary file '"
                        << new_filename << "'" << endl;
                }
                new_filename="";
                return false;
            }

          }
          else
            this_line += s + " ";
      }

      // write out
      if(ts.eoln())
      {
          if(bad_line)
          {
            bad_line_count++;
          }
          else
          {
            if(write_out)
            {
                // this_line
                *ost << this_line << endl;
                good_line_count++;
            }
          }
          bad_line=false;
          this_line = "";
      }

    }
    cerr << "skipped " << bad_line_count << " and kept "
      << good_line_count << " lines from file " << filename << endl;
    return true;
}

bool EST_Ngrammar::build_ngram(const EST_String &filename,
                         const EST_String &prev,
                         const EST_String &prev_prev,
                         const EST_String &last,
                         const EST_String &input_format)
{
    p_entry_type = EST_Ngrammar::frequencies;
    int bad_word=0;
    EST_String s;
    EST_TokenStream ts;
    int eoln_is_eos = FALSE;
    int sliding_window = TRUE;
    int count=0;
    clear();
    
    if ( (input_format == "") || (input_format == "sentence_per_line") )
    {
      // may do something here later
      eoln_is_eos = TRUE;
      sliding_window = TRUE;
    }
    else if (input_format == "sentence_per_file")
    {
      eoln_is_eos = FALSE;
      sliding_window = TRUE;
      p_number_of_sentences = 1;
    }
    else if(input_format == "ngram_per_line")
    {
      eoln_is_eos = FALSE;
      sliding_window = FALSE;
      p_number_of_sentences = 1;
    }
    else
    {
      cerr << "Can't build from '" << input_format << "' data" << endl;
      return false;
    }
    
    EST_IVector window(p_order);
    
    if (filename == "-")
    {
      if( ts.open(stdin, FALSE) == -1)
      {
          cerr << "EST_Ngrammar:: failed to open stdin";
          cerr << " for reading" << endl;
          return false;
      }
    }
    else if (ts.open(filename) == -1){
      cerr << "EST_Ngrammar: failed to open \"" << filename 
          << "\" for reading" << endl;
      return false;
    }
    
    // default input format is one sentence per line
    // full stops and commas etc. are taken literally
    // i.e. the user must do the preprocessing
    
    // we assume that all of prev,prev_prev,last are either
    // null or set, not a mixture of the two
    
    // at start window contains
    // [prev_prev, prev_prev, ....., prev_prev, prev]
    //  This is not added to the ngram model though
    if(sliding_window)
    {
      fill_window_start(window,prev,prev_prev);
      if (window(p_order-1) == -1)
          bad_word = p_order;
      else if( (p_order>1) && (window(p_order-2) == -1))
          bad_word = p_order-1;
      else
          bad_word=0;

      if(bad_word > 0)
        cerr << "at start : bad word at " << bad_word << endl;

    }
    while (!ts.eof())
    {
      s=ts.get().string();
      
      if (s != "")
      {
          if(sliding_window)
          {
            slide(window,-1);
            if (bad_word > 0)
                bad_word--;

            window[p_order-1] = wordlist_index(s);
            if (window(p_order-1) < 0)
            {
                cerr << "EST_Ngrammar::build_ngram " <<
                  " word \"" << s << "\" is not in vocabulary, skipping"
                      << endl;
                bad_word = p_order;
            }
            if (bad_word == 0)
                accumulate(window);
            else
            {
                cerr << "not accumulating : bad word at " << bad_word;
                cerr << " window=" << window; // l<< endl;
                bad_word--;
            }
          }
          else
          {
            // not a sliding window - wait for end of line and accumulate
            if(count < p_order)
            {
                if(count == p_order-1) // last thing in window (predictee)
                  window[count++] = predlist_index(s);
                else // not last thing (predictor)
                  window[count++] = wordlist_index(s);
                
                if (window(count-1) < 0)
                {
                  cerr << "EST_Ngrammar::build_ngram " <<
                      " word \"" << s << "\" is not in vocabulary, skipping"
                        << endl;
                  bad_word = 1;
                }
            }
            else
                cerr << "Too many items on line  - ignoring trailing ones !" << endl;
          }
      }
      
      // end of sentence ?
      if(ts.eoln())
      {
       
          if(!sliding_window)
          {
            if((count == p_order) && bad_word == 0)
                accumulate(window);
            count = 0;
            bad_word = 0;
          }
          else if (eoln_is_eos)
          {       
            // have there been any accumulates since the last increment
            if (window(p_order-1) != wordlist_index(last))
                p_number_of_sentences += 1;
            
            slide(window,-1);
            window[p_order-1] = wordlist_index(last);
            
            if(window(p_order-1) == -1)
            {
                //cerr << "WARNING : skipping over bad word '" 
                //<< last << "'" << endl;
                bad_word = p_order;
            }
            
            if (bad_word == 0)
                accumulate(window);
            
            fill_window_start(window,prev,prev_prev);
            
            // check for bad tags
            if (window(p_order-1) == -1)
                bad_word = p_order;
            else if( (p_order>1) && (window(p_order-2) == -1) )
                bad_word = p_order-1;
            else
                bad_word=0;
          }
      }
    }
    
    // if last accumulate was at end of sentence
    // we don't need to do the extra accumulate
    if ( sliding_window && (window(p_order-1) != wordlist_index(prev)))
    {
      
      // and finish with the ngram [.....last few words,last]
      slide(window,-1);
      window[p_order-1] = wordlist_index(last);
      
      if (window(p_order-1) == -1)
          bad_word=p_order;
      
      if (bad_word == 0)
      {
          accumulate(window);
          p_number_of_sentences += 1;
      }
    }
    
    ts.close();
    
    cerr << "Accumulated " << p_number_of_sentences << " sentences." << endl;
    return true;
}


void compute_backoff_weight(EST_Ngrammar *n, EST_StrVector &ngram, void*)
{
    int i,j;
    double sum1=0,sum2=0;
    
    
    EST_StrVector new_ngram;
    new_ngram.resize(ngram.n()-1);
    for(i=0;i<new_ngram.n();i++)
      new_ngram[i] = ngram(i);
    
    
       cerr << "computing backoff w for ";
       for(i=0;i<new_ngram.n();i++)
       cerr << new_ngram(i) << " ";
       cerr << "        \r";
       
       cerr << endl;
    
    // only have backoff weights if the associated n-1 gram exists
    if (!n->ngram_exists(new_ngram))
    {
      cerr << " NONE";
      
      // if ngram really exists, but was below threshold
      // set backoff weight to 1 (always back off)
      if (n->ngram_exists(new_ngram,0))
      {
          if(!n->set_backoff_weight(new_ngram,1))
            cerr << "WARNING : couldn't set weight !" << endl;
          cerr << " = 1";
      }
      cerr << endl;
      return;
    }
    
    // save
    EST_String tmp = ngram(ngram.n()-1);
    
    // for each possible word in the last position
    for(j=0;j<n->get_pred_vocab_length();j++)
    {
      ngram[ngram.n()-1] = n->get_pred_vocab_word(j);
      
      for(i=0;i<ngram.n();i++)
      cerr << ngram(i) << " ";
      
      if (n->ngram_exists(ngram))
      {
          cerr << n->probability(ngram) << " exists " << endl;
          // if we have the complete ngram, add it to sum1
          sum1 += n->probability(ngram);
      }
      else
      {
          
          // make this faster - take out of loop
          
          // or add the n-1 gram, excluding the first word to sum2
          EST_StrVector tmp_ngram;
          tmp_ngram.resize(ngram.n()-1);
          for(i=0;i<tmp_ngram.n();i++)
            tmp_ngram[i] = ngram(i+1);

          cerr << " unseen, P(";
          for(i=0;i<tmp_ngram.n();i++)
            cerr << tmp_ngram(i) << " ";

          cerr << ") = " << n->probability(tmp_ngram) << " " << endl;
          sum2 += n->probability(tmp_ngram);
      }
    }
    
    // and fill in the backoff weight
    
    if (sum2 == 0)      // no unseen ngrams, no backing off
    {
      if(!n->set_backoff_weight(new_ngram,1))
          cerr << "WARNING : couldn't set weight !" << endl;
      cerr << 0 << endl;
    }
    else
    {
      if (sum1 > 1)
      {
          cerr << "NEGATIVE WEIGHT for "; 
          for(i=0;i<new_ngram.n();i++)
            cerr << new_ngram(i) << " ";
          cerr << endl;
          
          cerr << "sum1=" << sum1 << " sum2=" << sum2;
          cerr << "  " << (1 - sum1) / sum2 << endl;
          
          for(j=0;j<n->get_pred_vocab_length();j++)
          {
            ngram[ngram.n()-1] = n->get_pred_vocab_word(j);
            
                        
               if (n->ngram_exists(ngram))
               {
               
               for(i=0;i<ngram.n();i++)
               cerr << ngram(i) << " ";
               cerr << " exists, prob = ";
               cerr << n->probability(ngram,false,true) << endl;
               }
               
          }
          sum1 = 1;
          sum2 = 1;     // to get a weight of 0
      }
      
      if(!n->set_backoff_weight(new_ngram, (1 - sum1) / sum2 ))
          cerr << "WARNING : couldn't set weight !" << endl;
      
      cerr <<  "sum1=" << sum1 << " sum2=" << sum2;
      cerr << "  weight=" << (1 - sum1) / sum2 << endl;
    }
    
    // restore
    ngram[ngram.n()-1] = tmp;    
    
}

bool EST_Ngrammar::compute_backoff_weights(const int mincount,
                                 const int maxcount)
{
    
    
    backoff_threshold = mincount;
    backoff_discount = new EST_DVector[p_order];
    
    int i,o;
    
    // but we must have children from the root node
    // for every unigram, since these can not be backed off
    // (must therefore be present, even if zero)
    // smoothing will fill in a floor for any zero frequencies
    
    backoff_restore_unigram_states();
    
    Good_Turing_discount(*this,maxcount);
    
    // and since some frequencies will have been set to zero
    // we have to prune away those branches of the tree
    //prune_backoff_representation();
    
    
    // now compute backoff weights, to satisfy the
    // 'probs sum to 1' condition
    
    // condition is (trigram case):
    // sum( p(w3|w1,w2) ) = 1, over all w1,w2
    
    // p(wd3|wd1,wd2)= 
    // if(trigram exists)           p_3(wd1,wd2,wd3)
    // else if(bigram w1,w2 exists) bo_wt_2(w1,w2)*p(wd3|wd2)
    // else                         p(wd3|w2)
    // and for a given wd3 they all sum to 1
    
    // so compute three sums :
    // sum(p_3(wd1,wd2,wd3)), for all w1,w2 where we have the trigram
    // sum(p(wd3|wd2)), for all w1,w2 where we have the bigram(w1,w2)
    //                  but not the trigram
    // sum(p(wd3|wd2)), for all w1,w2 where we don't have either
    
    // could probably do this recursively and more elegantly
    // but it makes my head hurt
    
    for (o=2;o<=order();o++) // for (o=1 .. .????
    {
      
      cerr << "Backing off order " << o << endl;
      //cerr << "=======================" << endl;
      
      EST_StrVector words;
      words.resize(o);
      
      // for each possible history (ngram, excluding last word)
      // compute the backoff weight
      for(i=0;i<o-1;i++)
          words[i]="";
      words[o-1] = "!FILLED!";
      iterate(words,&compute_backoff_weight,NULL);
      
      //cerr << "=========done==========" << endl;
      
    }
    
    return true;
}


void EST_Ngrammar::backoff_restore_unigram_states()
{
    // for every item in the root pdf, look for a child
    // and make it if not present
    
    // short cut is to cumulate some zero freq bigrams
    // to force construction of states where necessary
    EST_StrVector words;
    int j;
    words.resize(2);
    words[0] = "wibble"; // doesn't matter what this is, count is 0
    for(j=0;j<get_pred_vocab_length();j++)
    {
      words[1] = get_pred_vocab_word(j);
      backoff_representation->accumulate(words,0);
    }
    
}


void EST_Ngrammar::prune_backoff_representation(EST_BackoffNgrammarState *start_state)
{
    
    if (start_state == NULL)
      start_state = backoff_representation;
    
    //cerr << "pruning state " << start_state << endl << *start_state << endl;
    
    // remove any branches with zero frequency count
    
    // find children of this state with zero freq and zap them
    int k;
    double freq;
    EST_String name;
    for (k=start_state->pdf_const().item_start();
       !start_state->pdf_const().item_end(k);
       k = start_state->pdf_const().item_next(k))
    {
      start_state->pdf_const().item_freq(k,name,freq);
      if (freq < TINY_FREQ)
      {
          EST_BackoffNgrammarState *child = start_state->get_child(name);
          
          if (child!=NULL)
          {
            //cerr << "Zapping  " << name << " : " << child->level() 
            //<< " " << child<< endl;
            start_state->remove_child(child,name);
          }
      }
      
    }
    
    // then recurse through remaining children
    for (k=start_state->pdf_const().item_start();
       !start_state->pdf_const().item_end(k);
       k = start_state->pdf_const().item_next(k))
    {
      start_state->pdf_const().item_freq(k,name,freq);
      EST_BackoffNgrammarState *child = start_state->get_child(name);
      if (child!=NULL)
      {
          //cerr << "recursing to " << name << " : " << child->level() << endl;
          //if((child!=NULL) && (child->level() == 3))
          //cerr << *child  << endl;
          prune_backoff_representation(child);
      }
    }
}


ostream& operator<<(ostream& s, EST_Ngrammar &n)
{
    switch(n.p_representation)
    {
    case EST_Ngrammar::sparse:
      n.sparse_representation.print_freqs(s);
      break;
      
    case EST_Ngrammar::dense:
      s << "Dense" << endl;
      //        s << n.dense_representation << endl;
      break;
      
    case EST_Ngrammar::backoff:
      s << "Backoff" << endl;
      s << *(n.backoff_representation) << endl;
      break;
      
    default:
      cerr << "Unknown internal representation of EST_Ngrammar : can't print"
          << endl;
      break;
    }
    
    return s;
}

bool 
EST_Ngrammar::set_entry_type(EST_Ngrammar::entry_t new_type)
{
    if (new_type == p_entry_type)
      return true;
    
    // entry type conversion --- hmmmmm
    
    cerr << "Couldn't do entry type conversion !" << endl;
    return false;
}

bool EST_Ngrammar::sparse_to_dense()
{
    cerr << "EST_Ngrammar::sparse_to_dense() "
      <<" not implemented" << endl;
    return false;
}

bool EST_Ngrammar::dense_to_sparse()
{
    cerr << "EST_Ngrammar::dense_to_sparse()"
      <<" not implemented" << endl;
    return false;
}

int EST_Ngrammar::find_dense_state_index(const EST_IVector &words, 
                               int index) const
{
    int i,ind=0;
    for(i=0;i<p_order-1;i++)
      ind = ind*vocab->length() + words.a_no_check(i+index);
      
    return ind;
}

int EST_Ngrammar::find_next_state_id(int state, int word) const
{
    // Find a new state index from the current after moving with word
    int i,f;

    if (p_order == 1)
      return 0;
    for (f=1,i=0; i<p_order-2; i++)
      f*=vocab->length();
    return ((state%f)*vocab->length())+word;
}

EST_NgrammarState &EST_Ngrammar::find_state(const EST_StrVector &words)
{
    switch(p_representation)
    {
    case EST_Ngrammar::sparse:
      //    return p_states[sparse_representation.find_state(words)];
      return p_states[0];
      break;
      
    case EST_Ngrammar::dense:
    {
      EST_IVector tmp(words.n());
      int i;
      for(i=0;i<p_order-1;i++)
      {
          tmp[i] = wordlist_index(words(i));
          if (tmp(i) == -1) break;
      }
      tmp[i] = pred_vocab->index(words(i));
      if (tmp(i) == -1) break;
      return p_states[find_dense_state_index(tmp)];
    }
      break;
      
    case EST_Ngrammar::backoff:
      cerr << "find_state: not valid in backoff mode !" << endl;
      break;
      
    default:
      cerr << "find_state: unknown ngrammar representation" << endl;
      break;
    }
    
    return p_states[0];
}


const EST_NgrammarState &
EST_Ngrammar::find_state_const(const EST_StrVector &words) const
{
    switch(p_representation)
    {
    case EST_Ngrammar::sparse:
      //    return p_states[sparse_representation.find_state(words)];
      return p_states[0];
      break;
      
    case EST_Ngrammar::dense:
    {
      EST_IVector tmp(words.n());
      int i;
      for(i=0;i<p_order-1;i++)
      {
          tmp[i] = wordlist_index(words(i));
            if (tmp(i) == -1) break;
      }
      tmp[i] = pred_vocab->index(words(i));
      if (tmp(i) == -1) break;
      return p_states[find_dense_state_index(tmp)];
    }
      break;
      
    case EST_Ngrammar::backoff:
      cerr << "find_state_const: not valid in backoff mode !" << endl;
      break;
      
    default:
      cerr << "find_state: unknown ngrammar representation" << endl;
      break;
    }
    return p_states[0];
}


EST_NgrammarState &EST_Ngrammar::find_state(const EST_IVector &words)
{
    switch(p_representation)
    {
    case EST_Ngrammar::sparse:
      //    return p_states[sparse_representation.find_state(words)];
      return p_states[0];
      break;
      
    case EST_Ngrammar::dense:
      return p_states[find_dense_state_index(words)];
      break;
      
    case EST_Ngrammar::backoff:
      cerr << "find_state: not valid in backoff mode !" << endl;
      break;
      
    default:
      cerr << "find_state: unknown ngrammar representation" << endl;
      break;
    }
    return p_states[0];
}


const EST_NgrammarState &
EST_Ngrammar::find_state_const(const EST_IVector &words) const
{
    switch(p_representation)
    {
    case EST_Ngrammar::sparse:
      //    return p_states[sparse_representation.find_state(words)];
      return p_states[0];
      break;
    case EST_Ngrammar::dense:
      return p_states[find_dense_state_index(words)];
      break;
      
    case EST_Ngrammar::backoff:
      cerr << "find_state_const: not valid in backoff mode !" << endl;
      break;
      
    default:
      cerr << "find_state: unknown ngrammar representation" << endl;
      break;
    }
    
    return p_states[0];
}


bool EST_Ngrammar::set_representation(EST_Ngrammar::representation_t new_representation)
{
    
    if (new_representation == p_representation)
      return true;
    
    if (new_representation == EST_Ngrammar::sparse)
      return sparse_to_dense();
    else if (new_representation == EST_Ngrammar::dense)
      return dense_to_sparse();
    else
    {
      cerr << "set_representation: unknown ngrammar representation" << endl;
      return FALSE;
    }
}

double EST_Ngrammar::probability(const EST_StrVector &words, bool force, const bool trace) const
{
    // Probability of this ngram
    (void)force;
    
    switch(p_representation)
    {
    case EST_Ngrammar::sparse:
    case EST_Ngrammar::dense:
      return find_state_const(words).probability(lastword(words));
      break;
      
    case EST_Ngrammar::backoff:
      return backoff_probability(words,trace);
      break;
      
    default:
      cerr << "probability: unknown ngrammar representation" << endl;
      return -1;
      break;
    }
}

double EST_Ngrammar::frequency(const EST_StrVector &words, bool force, const bool trace) const
{
    // Frequency of this ngram
    (void)force;
    
    switch(p_representation)
    {
    case EST_Ngrammar::sparse:
    case EST_Ngrammar::dense:
      return find_state_const(words).frequency(lastword(words));
      break;
      
    case EST_Ngrammar::backoff:
      return backoff_probability(words,trace); // can't do freqs
      break;
      
    default:
      cerr << "probability: unknown ngrammar representation" << endl;
      return -1;
      break;
    }
}

const EST_String &EST_Ngrammar::predict(const EST_StrVector &words,
                              double *prob,int *state) const
{
    // What's the most probable word given this list of words 
    
    switch(p_representation)
    {
      case EST_Ngrammar::sparse:
      case EST_Ngrammar::dense:
        {
          const EST_NgrammarState &s = find_state_const(words);
          *state = s.id();
          return s.most_probable(prob);
        }
      break;
    
      case EST_Ngrammar::backoff:
      state=NULL; // there are no states per se
      return backoff_most_probable(words,prob);
      break;
    
      default:
      cerr << "probability: unknown ngrammar representation" << endl;
      return EST_String::Empty;
      break;
    }
}

const EST_String &EST_Ngrammar::predict(const EST_IVector &words,
                        double *prob,int *state) const
{
    // What's the most probable word given this list of words 
    
    switch(p_representation)
    {
      case EST_Ngrammar::sparse:
      case EST_Ngrammar::dense:
        {
          const EST_NgrammarState &s = find_state_const(words);
          *state = s.id();
          return s.most_probable(prob);
        }
      break;
    
      case EST_Ngrammar::backoff:
      cerr << "probability: IVector access to backoff not supported" << endl;
      return EST_String::Empty;
      break;
    
      default:
      cerr << "probability: unknown ngrammar representation" << endl;
      return EST_String::Empty;
      break;
    }
}

int EST_Ngrammar::find_state_id(const EST_StrVector &words) const
{
    switch(p_representation)
    {
      case EST_Ngrammar::sparse:
      case EST_Ngrammar::dense:
        {
          const EST_NgrammarState &s = find_state_const(words);
          return s.id();
      }
      default:
      cerr << "Ngrammar: representation doesn't support states" << endl;
      return 0;
      break;
    }
}

int EST_Ngrammar::find_state_id(const EST_IVector &words) const
{
    switch(p_representation)
    {
      case EST_Ngrammar::sparse:
      case EST_Ngrammar::dense:
        {
          const EST_NgrammarState &s = find_state_const(words);
          return s.id();
      }
      default:
      cerr << "Ngrammar: representation doesn't support states" << endl;
      return 0;
      break;
    }
}

EST_String EST_Ngrammar::get_vocab_word(int i) const
{
    if (vocab)
        return vocab->name(i);
    else
        return NOVOCAB;
}
 
int EST_Ngrammar::get_vocab_word(const EST_String &s) const
{
    int index = vocab->name(s);
    return index;
}

double EST_Ngrammar::reverse_probability(const EST_StrVector &words, 
                               bool force) const
{
    // Whats the probability of this ngram-1 given last word in ngram
    (void)force;
    
    switch(p_representation)
    {
      case EST_Ngrammar::sparse:
      case EST_Ngrammar::dense:
        {
          const EST_NgrammarState &s = find_state_const(words);
          // need number of occurences of words[p_order-1]
          return s.frequency(lastword(words))/
            vocab_pdf.frequency(lastword(words));
        }
      break;
    
      case EST_Ngrammar::backoff:
      return backoff_reverse_probability(words);
      break;
    
      default:
      cerr << "probability: unknown ngrammar representation" << endl;
      return -1;
      break;
    }
}

double EST_Ngrammar::reverse_probability(const EST_IVector &words, 
                               bool force) const
{
    // Whats the probability of this ngram-1 given last word in ngram
    (void)force;
    
    switch(p_representation)
    {
      case EST_Ngrammar::sparse:
      case EST_Ngrammar::dense:
        { 
          const EST_NgrammarState &s = find_state_const(words);
          // need number of occurences of words[p_order-1]
          return s.frequency(lastword(words))/
            vocab_pdf.frequency(lastword(words));
      }
      break;
    
      case EST_Ngrammar::backoff:
      cerr << "probability: reverse prob unavailable for backoff  ngram"
          << endl;
      return -1;
      break;
    
      default:
      cerr << "probability: unknown ngrammar representation" << endl;
      return -1;
      break;
    }
}

const EST_DiscreteProbDistribution &
EST_Ngrammar::prob_dist(int state) const
{
    return p_states[state].pdf_const();
}

const EST_DiscreteProbDistribution &
EST_Ngrammar::prob_dist(const EST_StrVector &words) const
{
    
    switch(p_representation)
    {
      case EST_Ngrammar::sparse:
      case EST_Ngrammar::dense:
      {
        const EST_NgrammarState &s = find_state_const(words);
        return s.pdf_const();
      }
      break;
    
    case EST_Ngrammar::backoff:
      return backoff_prob_dist(words);
      break;
    
    default:
      cerr << "probability: unknown ngrammar representation" << endl;
      return PSTnullProbDistribution;
      break;
  }
}

const EST_DiscreteProbDistribution &
EST_Ngrammar::prob_dist(const EST_IVector &words) const
{
    
    switch(p_representation)
    {
      case EST_Ngrammar::sparse:
      case EST_Ngrammar::dense:
      {
        const EST_NgrammarState &s = find_state_const(words);
        return s.pdf_const();
      }
      break;
    
    case EST_Ngrammar::backoff:
      cerr << "probability: unsupport IVector access of backoff ngram" << endl;
      return PSTnullProbDistribution;
      break;
    
    default:
      cerr << "probability: unknown ngrammar representation" << endl;
      return PSTnullProbDistribution;
      break;
  }
}

EST_read_status
EST_Ngrammar::load(const EST_String &filename)
{
    
    EST_read_status r_val;
    
    //if ((r_val = load_ngram_htk_ascii(filename, *this)) != wrong_format)
    //return r_val;
    //if ((r_val = load_ngram_htk_binary(filename, *this)) != wrong_format)
    //return r_val;
    if ((r_val = load_ngram_cstr_ascii(filename, *this)) != wrong_format)
      return r_val;
    if ((r_val = load_ngram_cstr_bin(filename, *this)) != wrong_format)
      return r_val;
    
    // maybe the file is compressed ?
    EST_Pathname fname(filename);
    EST_String tmp_fname("");
    
    // crude but effective
    if (fname.extension() == GZIP_FILENAME_EXTENSION)
      tmp_fname = uncompress_file_to_temporary(filename,
                                     "gzip --decompress --stdout");
    else if (fname.extension() == COMPRESS_FILENAME_EXTENSION)
      tmp_fname = uncompress_file_to_temporary(filename,"uncompress -c");
    
    if(tmp_fname != "")
    {
      r_val = load(tmp_fname);
      delete_file(tmp_fname);
      return r_val;
    }
    else
      return misc_read_error;
    
    cerr << "EST_Ngrammar::load can't determine ngrammar file type for input file " << filename << endl;
    return wrong_format;
}

EST_read_status
EST_Ngrammar::load(const EST_String &filename, const EST_StrList &wordlist)
{
    
    // for backed off grammars
    // need a wordlist to load ARPA format
    
    EST_read_status r_val;
    
    if ((r_val = load_ngram_arpa(filename, *this, wordlist)) != wrong_format)
      return r_val;
    
    // try other types, then check wordlist fits
    //if ((r_val = load_ngram_htk_ascii(filename, *this)) != wrong_format)
    //return r_val;
    //if ((r_val = load_ngram_htk_binary(filename, *this)) != wrong_format)
    //return r_val;
    if ((r_val = load_ngram_cstr_ascii(filename, *this)) != wrong_format)
    {
      if ((r_val == format_ok) && check_vocab(wordlist))
          return r_val;
      else
      {
          cerr << "Wordlist file does not match grammar wordlist !" << endl;
          return misc_read_error;
      }
    }
    
    if ((r_val = load_ngram_cstr_bin(filename, *this)) != wrong_format)
    {
      if ((r_val == format_ok) && check_vocab(wordlist))
          return r_val;
      else
      {
          cerr << "Wordlist does not match grammar !" << endl;
          return misc_read_error;
      }
    }
    
    
    cerr << "EST_Ngrammar::load can't determine ngrammar file type for input file " << filename << endl;
    return wrong_format;
}


void
EST_Ngrammar::make_htk_compatible()
{
    
    cerr << "EST_Ngrammar::make_htk_compatible() not written yet." << endl;
    return;
}

EST_write_status
EST_Ngrammar::save(const EST_String &filename, const EST_String type, 
               const bool trace,double floor)
{
    
    if (type == "")
      return save(filename,"cstr_ascii",false,floor); // choose default type
    if (type == "htk_ascii")
      return save_ngram_htk_ascii(filename, *this, floor);
    //if (type == "htk_binary")
    //return save_htk_binary(filename, *this);
    if (type == "arpa")
      return save_ngram_arpa(filename, *this);
    if (type == "cstr_ascii")
      return save_ngram_cstr_ascii(filename, *this,trace,floor);
    if (type == "cstr_bin")
      return save_ngram_cstr_bin(filename, *this, trace,floor);
    if (type == "wfst")
      return save_ngram_wfst(filename, *this);
    
    cerr << "EST_Ngrammar::save unknown output file type " << type << endl;
    return write_fail;
}

void EST_Ngrammar::iterate(EST_StrVector &words,
                     void (*function)(EST_Ngrammar *n,
                                  EST_StrVector &words,
                                  void *params), 
                     void *params)
{
    int i,j=-1;
    EST_String tmp;
    
    // find next item in ngram to fill in
    for(i=0;i<words.n();i++)
      if (words[i] == "")
      {
          j=i;
          break;
      }
    
    if (j==-1) 
    {
      // all filled in, so do the function
      (*function)(this,words,params);
      
    }
    else
    {
      
      tmp = words(j);
      if (j==p_order-1) // final position - use pred vocab
      {
          for(i=0;i<pred_vocab->length();i++){
            words[j] = pred_vocab->name(i);
            iterate(words,function,params);
          }
          
      }
      else
      {
          for(i=0;i<vocab->length();i++){
            words[j] = vocab->name(i);
            iterate(words,function,params);
          }
      }     
      words[j] = tmp;
    }
}

void EST_Ngrammar::const_iterate(EST_StrVector &words,
                         void (*function)(const EST_Ngrammar *const n,
                                      EST_StrVector &words,
                                      void *params), 
                         void *params) const
{
    int i,j=-1;
    EST_String tmp;
    
    // find next item in ngram to fill in
    for(i=0;i<words.n();i++)
      if (words[i] == "")
      {
          j=i;
          break;
      }
    
    if (j==-1)
    {
      // all filled in, so do the function
      (*function)(this,words,params);
      
    }
    else
    {
      
      tmp = words(j);
      if (j==p_order-1) // final position - use pred vocab
      {
          for(i=0;i<pred_vocab->length();i++){
            words[j] = pred_vocab->name(i);
            const_iterate(words,function,params);
          }
          
      }
      else
      {
          for(i=0;i<vocab->length();i++){
            words[j] = vocab->name(i);
            const_iterate(words,function,params);
          }
      }     
      words[j] = tmp;
    }
}

void EST_Ngrammar::print_freqs(ostream &os,double floor)
{
    
    if (p_representation == EST_Ngrammar::backoff)
      backoff_representation->print_freqs(os,p_order);
    else
    {
      int i,j,k;
      EST_IVector window(p_order-1);
      
      for (i=0; i < p_num_states; i++)
      {
          // print out each ngram : freq
          for (k=p_states[i].pdf().item_start();
             !p_states[i].pdf().item_end(k);
             k = p_states[i].pdf().item_next(k))
          {
            double freq;
            EST_String name;
            int ind = i;
            p_states[i].pdf().item_freq(k,name,freq);
            if (freq == 0)
                freq = floor;
            if (freq > 0)
            {
                for (j = p_order-2; j >= 0; j--)
                {
                  window[j] = ind%vocab->length();
                  ind /= vocab->length();
                }
                for (j = 0; j < p_order-1; j++)
                  os << wordlist_index(window(j)) << " ";
                os << name << " : " << freq << endl;
            }
          }
      }
    }
}

const EST_DiscreteProbDistribution &
EST_Ngrammar::backoff_prob_dist(const EST_StrVector &words) const
{
    // need to make this on the fly
    // by looking at all possible words in the final
    // position
    
    int i,j;
    EST_StrVector ngram;
    ngram.resize(words.n()+1);
    for(i=0;i<words.n();i++)
      ngram[i] = words(i);
    
    EST_DiscreteProbDistribution *p = new EST_DiscreteProbDistribution(pred_vocab);
    
    for(j=0;j<get_pred_vocab_length();j++)
    {
      ngram[ngram.n()-1] = get_pred_vocab_word(j);
      double tmp = backoff_probability(ngram,false);
      p->set_frequency(j,tmp); // actually probability
    }
    p->set_num_samples(1.0); // we can't do it in frequencies
    
    return *p;
}

const double EST_Ngrammar::get_backoff_discount(const int order, const double freq) const
{
    if(order > p_order)
    {
      cerr << "order too great in EST_Ngrammar::get_backoff_discount" << endl;
      return 0;
    }
    
    else if( (int)freq < backoff_discount[order-1].n())
      return backoff_discount[order-1]((int)freq);
    
    else
      return 0;
}

const double EST_Ngrammar::backoff_probability(const EST_StrVector &words,
                                     const bool trace) const
{
    const EST_BackoffNgrammarState *state;
    int i;
    EST_StrVector new_ngram;
    double f=0,f2=0;
    
    
    if (trace)
    {
      cerr << "backoff_probability( ";
      for(i=0;i<words.n();i++)
          cerr << words(i) << " ";
      cerr << ") ";
    }
    
    // are we down to the unigram ?
    if (words.n() == 1)
    {
      if (trace)
          cerr << "unigram " << backoff_representation->probability(words(0))
            << endl;
      
      f=backoff_representation->frequency(words(0));
      
      // it seems outrageous, but use a floor because unigram
      // probs of zero mess up backing off
      if(f>0)
          return f / backoff_representation->pdf_const().samples();
      else
          return backoff_unigram_floor_freq / backoff_representation->pdf_const().samples();
    }
    
    // the first n-1 words in the ngram -- deeply inefficient at the moment !
    // should pass separately
    new_ngram.resize(words.n()-1);
    for(i=0;i<new_ngram.n();i++)
      new_ngram[i] = words(i);
    
    // see if we have the ngram
    state=backoff_representation->get_state(words);
    
    if( (state != NULL) &&
       ((f=state->frequency(words(0))) > backoff_threshold) )
    {
      
      //if (trace)
      //cerr << "in state " << state << " = " << *state << endl << endl;
      
      
      // because f>0, the freq of new_ngram must be non-zero too
      
      // special case - we don't have a freq for !ENTER (or whatever it's called)
      // so use the no. of sentences used to build this grammar
      if((new_ngram(0) == p_sentence_start_marker) ||
         (new_ngram(0) == p_sentence_end_marker) )
      {
          f2 = p_number_of_sentences;
          if (trace)
            cerr << "special freq used : " << f2 << endl;
      }
      else
      {
          state=backoff_representation->get_state(new_ngram);
          if (state == NULL)
          {
            cerr << "Something went horribly wrong !" << endl;
            return -1;
          }
          // state->frequency(new_ngram(0)) is the number of times
          // we saw new_ngram
          
          f2=state->frequency(new_ngram(0));
          
          if (trace)
          {
            //cerr << "n-1 state is " << *state << endl;
            cerr << " using freq for " << new_ngram(0) << " of " << f2 << endl;
          }
      }
      
      if (trace)
      {
          
          cerr << " ..... got (" << f << " - "
            << get_backoff_discount(state->level()+1,f)
                << ")/" << f2 << " = "
                  << (f - get_backoff_discount(state->level()+1,f) ) / f2
                      << endl;
      }
      return (f - get_backoff_discount(state->level()+1,f) ) / f2;
    }
    
    else                // back off
    {
      
      double bo_wt = get_backoff_weight(new_ngram);
      
      for(i=0;i<new_ngram.n();i++)
          new_ngram[i] = words(i+1);
      
      if (trace)
      {
          cerr << "backed off(" << bo_wt << ") to (";
          for(i=0;i<new_ngram.n();i++)
            cerr << new_ngram(i) << " ";
          cerr << ")  ";
      }
      
      return bo_wt * backoff_probability(new_ngram,trace);
    }
    
    // should never reach here !
    // but gcc thinks it does
    cerr << "oops !";
    return -1;
    
}


const double 
EST_Ngrammar::backoff_reverse_probability_sub(const EST_StrVector &words,
                                    const EST_BackoffNgrammarState *root) const
{
    
    // so similar to backoff prob - should combine
    // to do ......
    
    const EST_BackoffNgrammarState *state;
    int i;
    EST_StrVector new_ngram;
    double f=0;
    
    /*    
       cerr << "backoff_rev_probability_sub( ";
       for(i=0;i<words.n();i++)
       cerr << words(i) << " ";
       cerr << ") ";
       */
    // are we down to the unigram ?
    if (words.n() == 1)
    {
      //cerr << "unigram " << root->probability(words(0))
      //<< endl;
      return root->probability(words(0));
    }
    
    // the first n-1 words in the ngram -- deeply inefficient at the moment !
    // should pass separately
    new_ngram.resize(words.n()-1);
    for(i=0;i<new_ngram.n();i++)
      new_ngram[i] = words(i);
    
    // see if we have the ngram
    state=root->get_state(words);
    
    
    if( (state != NULL) &&
       ((f=state->frequency(words(0))) > 0) )
    {
      // because f>0, the freq of new_ngram must be non-zero too
      state=root->get_state(new_ngram);
      if (state == NULL)
      {
          cerr << "Something went horribly wrong !" << endl;
          return -1;
      }
      // state->frequency(new_ngram(0)) is the number of times
      // we saw new_ngram
      //cerr << "got " << f << "/" << state->frequency(new_ngram(0)) << endl;
      return f / state->frequency(new_ngram(0));
    }
    
    else                // back off
    {
      
      double bo_wt = root->get_backoff_weight(new_ngram);
      //double bo_wt = root->get_backoff_weight(words);
      
      for(i=0;i<new_ngram.n();i++)
          new_ngram[i] = words(i+1);
      
      //cerr << "backed off(" << bo_wt << ") ";
      return bo_wt * backoff_reverse_probability_sub(new_ngram,root);
    }
    
    // should never reach here !
    // but gcc thinks it does
    cerr << "oops !";
    return -1;
    
}

const double 
EST_Ngrammar::backoff_reverse_probability(const EST_StrVector &words) const
{
    
    // probability of words 1...n-1 , given the last word
    // easier to compute from the backoff tree structure than
    // 'normal' probability
    
    // find level 1 child corresponding to history before last word
    EST_BackoffNgrammarState *state;
    state = backoff_representation->get_child(words(words.n()-1));
    
    if(state == NULL)
    {
      // predictee isn't there so ......... ???
      return 0;
    }
    
    // now find backoff probability of words 0...n-2
    // starting from this state
    return backoff_reverse_probability_sub(words,state);
    
}

const EST_String & 
EST_Ngrammar::backoff_most_probable(const EST_StrVector &words, 
                            double *prob) const
{
    return backoff_prob_dist(words).most_probable(prob);
}

void slide(EST_IVector &v, const int l)
{
    int i;
    
    // slide elements by 'l' without  wraparound
    
    if (l==0)
      return;
    
    else if (l<0)
    {
      // slide left
      
      for(i=0;i<v.n()+l;i++)
          v[i] = v(i-l);
      
      for(;i<v.n();i++)
          v[i] = 0;
      
    }
    else
    {
      // slide right
      
      for(i=v.n()-1;i>=l;i--)
          v[i] = v(i-l);
      
      for(;i>=0;i--)
          v[i] = 0;
    }
}

void
EST_Ngrammar::backoff_traverse(EST_BackoffNgrammarState *start_state,
                         void (*function)(EST_BackoffNgrammarState *s,
                                    void *params),
                         void *params)
{
    
    // apply the function to this node
    function(start_state,params);
    
    // and recurse down the tree
    int k;
    double freq;
    EST_String name;
    for (k=start_state->pdf_const().item_start();
       !start_state->pdf_const().item_end(k);
       k = start_state->pdf_const().item_next(k))
    {
      start_state->pdf_const().item_freq(k,name,freq);
      EST_BackoffNgrammarState *child = start_state->get_child(name);
      if (child!=NULL)
          backoff_traverse(child,function,params);
      
    }
}

void
EST_Ngrammar::backoff_traverse(EST_BackoffNgrammarState *start_state,
                         void (*function)(EST_BackoffNgrammarState *s,
                                    void *params),
                         void *params,
                         const int level)
{
    // apply the function to this node, if level is correct
    if (start_state->level() == level)
    {
      function(start_state,params);
    }
    else if (start_state->level() < level)
    {
      // and recurse down the tree if we haven't
      // reached the level yet
      int k;
      double freq;
      EST_String name;
      
      for (k=start_state->pdf_const().item_start();
           !start_state->pdf_const().item_end(k);
           k = start_state->pdf_const().item_next(k))
      {
          start_state->pdf_const().item_freq(k,name,freq);
          EST_BackoffNgrammarState *child = start_state->get_child(name);
          if (child!=NULL)
            backoff_traverse(child,function,params,level);
          
      }
      
      
    }
}
void
merge_other_grammar(EST_Ngrammar *n, EST_StrVector &ngram, void *params)
{
    
    EST_Ngrammar *other_n = (EST_Ngrammar*)((void**)params)[0];
    float *weight = (float*)((void**)params)[1];
    
    if(other_n->ngram_exists(ngram))
      n->accumulate(ngram,*weight * other_n->frequency(ngram));
    
}

bool
EST_Ngrammar::merge(EST_Ngrammar &n,float weight)
{
    EST_StrVector words;
    words.resize(p_order);
    
    void **params = new void*[2];
    params[0] = (void*)&n;
    params[1] = (void*)&weight;
    
    iterate(words,&merge_other_grammar,(void*)params);
    
    delete [] params;
    return true;
}


void slide(EST_StrVector &v, const int l)
{
    // slide elements by 'l' without  wraparound
    int i;
    
    if (l==0)
      return;
    
    else if (l<0)
    {
      // slide left
      
      for(i=0;i<v.n()+l;i++)
          v[i] = v(i-l);
      
      for(;i<v.n();i++)
          v[i] = "";
      
    }
    else
    { 
      // slide right
      
      for(i=v.n()-1;i>=l;i--)
          v[i] = v(i-l);
      
      for(;i>=0;i--)
          v[i] = "";
      
    }
}


Generated by  Doxygen 1.6.0   Back to index