
ngrammar_aux.cc

/*************************************************************************/
/*                                                                       */
/*                Centre for Speech Technology Research                  */
/*                     University of Edinburgh, UK                       */
/*                         Copyright (c) 1996                            */
/*                        All Rights Reserved.                           */
/*                                                                       */
/*  Permission is hereby granted, free of charge, to use and distribute  */
/*  this software and its documentation without restriction, including   */
/*  without limitation the rights to use, copy, modify, merge, publish,  */
/*  distribute, sublicense, and/or sell copies of this work, and to      */
/*  permit persons to whom this work is furnished to do so, subject to   */
/*  the following conditions:                                            */
/*   1. The code must retain the above copyright notice, this list of    */
/*      conditions and the following disclaimer.                         */
/*   2. Any modifications must be clearly marked as such.                */
/*   3. Original authors' names are not deleted.                         */
/*   4. The authors' names are not used to endorse or promote products   */
/*      derived from this software without specific prior written        */
/*      permission.                                                      */
/*                                                                       */
/*  THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK        */
/*  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING      */
/*  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT   */
/*  SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE     */
/*  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES    */
/*  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   */
/*  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,          */
/*  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF       */
/*  THIS SOFTWARE.                                                       */
/*                                                                       */
/*************************************************************************/
/*                     Author :  Simon King & Alan W Black               */
/*                     Date   :  February 1997                           */
/*-----------------------------------------------------------------------*/
/*                                                                       */
/* Auxiliary functions for the EST_Ngrammar class                        */
/*                                                                       */
/*=======================================================================*/
#include <iostream>
#include <cmath>
#include <cstring>
#include "EST_String.h"
#include "EST_Ngrammar.h"

using namespace std;

static bool
ExponentialFit(EST_DVector &N, double &a, double &b, int first, int last)
{
  // fit the function  N(r) = e^a * r^b  (where a,b are constants)
  // to the points N(first)...N(last) inclusive,
  // i.e. a straight-line fit in the (natural) log domain,
  // minimising the mean squared error in that domain
  
  // limitation : r must be an integer (it is the index of N)
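  
  // Sketch of the closed form used at the end of this function: with
  // y_r = ln N(r) and x_r = ln r, minimising
  //     E(a,b) = sum_{r=first..last} (y_r - a - b*x_r)^2
  // and setting dE/da = dE/db = 0 gives (R = number of points)
  //     b = ( (sum y)(sum x)/R - sum(x*y) ) / ( (sum x)^2/R - sum(x^2) )
  //     a = ( sum y - b*(sum x) ) / R
  // (points with N(r) == 0 are simply skipped in the y sums below)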
  
  
  // default
  if (last == -1)
    last = N.n()-1;
  
  if (first < 0)
    {
      cerr << "ExponentialFit : first must be >= 0" << endl;
      return false;
    }
  
  if (last > N.n()-1)
    {
      cerr << "ExponentialFit : last must be <= N.n()-1 = " << N.n()-1 << endl;
      return false;
    }
  
  if (first == last) 
    {
      a=log(N(first));
      b=0;
      return true;
    }
  
  double ElnNr=0.0, ElnNrlnr=0.0,
         Elnr=0.0, Elnr2=0.0,
         R=0.0;
  
  for(int r=first;r<=last;r++)
    {
      R += 1;
      if ( N(r) > 0)
      {
        ElnNr += log( N(r) );
        ElnNrlnr += log( N(r) ) * log( (double)r );
      }
      Elnr += log( (double)r );
      Elnr2 += log( (double)r ) * log( (double)r );
    }
  
  // and the answer is :
  b = ( (ElnNr*Elnr/R) - ElnNrlnr ) / ( (Elnr*Elnr/R) - Elnr2);
  a = (ElnNr - (b*Elnr) ) / R;
  
  return true;
}

static bool 
smooth_ExponentialFit(EST_DVector &N, int first, int last)
{
  double a=0.0,b=0.0;
  
  if (!ExponentialFit(N,a,b,first,last)) 
    {
      cerr << "smooth_ExponentialFit : ExponentialFit failed !" << endl;
      return false;
    }
  
  for(int r=first;r<=last;r++)
    N[r] = exp(a) * pow((double)r, b);
  
  return true;
}
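
// A minimal usage sketch (illustrative only; the numbers and the helper
// name are invented for the example), kept inside #if 0 so it is not
// compiled: smooth an example frequency-of-frequencies vector over the
// range 1..5.
#if 0
static void example_exponential_smooth(void)
{
    EST_DVector N;
    N.resize(10);
    N.fill(0.0);
    N[1] = 100; N[2] = 40; N[3] = 20; N[4] = 12; N[5] = 8;

    // replaces N(1)..N(5) with values taken from the fitted curve e^a * r^b
    if (!smooth_ExponentialFit(N, 1, 5))
        cerr << "smooth_ExponentialFit failed" << endl;
}
#endif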

void make_f_of_f(EST_BackoffNgrammarState *s,void *params)
{
    int k;
    double freq;
    EST_String name;

    EST_DVector *ff = (EST_DVector*)params;
    int max = ff->n();
    for (k=s->pdf_const().item_start();
       !s->pdf_const().item_end(k);
       k = s->pdf_const().item_next(k))
    {
      s->pdf_const().item_freq(k,name,freq);
      if (freq+0.5 < max)
          (*ff)[(int)(freq+0.5)] += 1;
    }
}

void get_max_f(EST_BackoffNgrammarState *s,void *params)
{
    int k;
    double freq;
    EST_String name;

    double *max = (double*)params;
    for (k=s->pdf_const().item_start();
       !s->pdf_const().item_end(k);
       k = s->pdf_const().item_next(k))
      {
        s->pdf_const().item_freq(k,name,freq);
        if (freq > *max)
          *max = freq;
      }
}

void map_f_of_f(EST_BackoffNgrammarState *s,void *params)
{
    int k;
    double freq;
    EST_String name;

    //cerr << "map_f_of_f : visited " << *s << endl;

    EST_DVector *map = (EST_DVector*)params;
    int max = map->n();
    for (k=s->pdf_const().item_start();
       !s->pdf_const().item_end(k);
       k = s->pdf_const().item_next(k))
      {
        s->pdf_const().item_freq(k,name,freq);
        if (freq+0.5 < max)
          {
            double nfreq = (*map)((int)(freq+0.5));
            s->pdf().set_frequency(name,nfreq);
          }
      }
}

void zero_small_f(EST_BackoffNgrammarState *s,void *params)
{
    int k;
    double freq;
    EST_String name;

    double *min = (double*)params;
    for (k=s->pdf_const().item_start();
       !s->pdf_const().item_end(k);
       k = s->pdf_const().item_next(k))
      {
        s->pdf_const().item_freq(k,name,freq);
        if (freq < *min)
        {
            //cerr << "zeroing " << freq << " < " << *min << endl;
            s->pdf().override_frequency(k,0.0);
        }
      }
}

void frequency_of_frequencies(EST_DVector &ff, EST_Ngrammar &n,int this_order)
{
  int i,k,size;
  double max=0.0;

  // if ff has zero size, do complete frequency of frequencies
  // otherwise just fill in frequencies from 1 to ff.n()-1
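  // Illustrative example (numbers invented): five distinct ngrams with
  // (rounded) counts 3,1,1,2,3 give ff(1)=2, ff(2)=1, ff(3)=2; ff(0) is
  // then the number of possible but unseen ngrams, i.e. vocab^order minus
  // the number of seen ngram types.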

  bool complete = (bool)(ff.n() == 0);

  switch(n.representation())
    {

      case EST_Ngrammar::sparse:
      case EST_Ngrammar::dense:
      {
          size = n.num_states();
          if (complete)
            {
              // find highest frequency in EST_Ngram
              for(i=0;i<size;i++)
                if (n.p_states[i].pdf().samples() > max)
                  max = n.p_states[i].pdf().samples();
              
              ff.resize((int)(max+1.5));
              ff.fill(0.0);
            }

          // Sum the frequencies
          for(i=0;i<size;i++)
            {
              for (k=n.p_states[i].pdf().item_start();
                   !n.p_states[i].pdf().item_end(k);
                   k = n.p_states[i].pdf().item_next(k))
                {
                  EST_String name;
                  double freq;
                  n.p_states[i].pdf().item_freq(k,name,freq);
                  if ((int)(freq+0.5) < ff.n())
                      ff[(int)(freq+0.5)] += 1;
                }
            }
          
          if (complete)
            {
              // fill in ff(0) - the number of unseen ngrams
              
              double total=0;
              for (i=1;i<ff.n();i++)
                total += ff(i);
              
              ff[0] = pow(n.get_vocab_length(),n.order()) - total;
            }
      }
      break;
      

      case EST_Ngrammar::backoff:
      {
          if (complete)
            {
              n.backoff_traverse(n.backoff_representation,
                             &get_max_f,(void*)(&max),
                             this_order-1);
              ff.resize((int)(max+1.5));
            }
          
          // backoff grammar has grammars of orders
          // order, order-1, ..., 1
          // and we treat each order independently

          for (i=0;i<ff.n();i++)
            ff[i] = 0;

          n.backoff_traverse(n.backoff_representation,
                         &make_f_of_f,(void*)(&ff),
                         this_order-1);
          
          if (complete)
            {
              // fill in ff(0) - the number of unseen ngrams              
              double total=0;
              for (i=1;i<ff.n();i++)
                total += ff(i);
              ff[0] = pow(n.get_vocab_length(),this_order) - total;
            }
      }
      break;

      default:
      cerr << "unknown representation for EST_Ngrammar" << endl;
      break;
    }

}

void map_frequencies(EST_Ngrammar &n, const EST_DVector &map, const int this_order)
{
  int i,k;


  switch(n.representation())
    {

      case EST_Ngrammar::sparse:
      case EST_Ngrammar::dense:
      {
          int size = n.p_num_states;
          
          for(i=0;i<size;i++)
            {
              for (k=n.p_states[i].pdf().item_start();
                   !n.p_states[i].pdf().item_end(k);
                   k = n.p_states[i].pdf().item_next(k))
                {
                  EST_String name;
                  double freq,nfreq;
                  n.p_states[i].pdf().item_freq(k,name,freq);
                  nfreq = map((int)(freq+0.5));
                  n.p_states[i].pdf().set_frequency(name,nfreq);
                }
            }
      }
      break;

      case EST_Ngrammar::backoff:
      {

          // backoff grammar has grammars of orders
          // order, order-1, ..., 1
          // and we treat each order independently
          n.backoff_traverse(n.backoff_representation,
                         &map_f_of_f,(void*)(&map),
                         this_order-1);
          
      }
      break;

      default:
      cerr << "unknown representation for EST_Ngrammar" << endl;
      break;

    }
}

static void 
adjusted_frequencies_BasicGoodTuring(EST_DVector &M,
                             const EST_DVector &N, 
                             int maxcount)
{
    // produce a map of frequencies, given a (smoothed) distribution
    // only map up to a frequency of maxcount; beyond that
    // map(r) == r
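    
    // Worked example (numbers invented): with N(1)=100 and N(2)=40 the
    // adjusted count for singletons is M(1) = (1+1)*N(2)/N(1) = 0.8, so
    // once-seen events give up part of their mass; where N(r) or N(r+1)
    // is zero the identity map M(r) = r is used instead (see below).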
    
    if (maxcount > N.n()-2)
    {
      maxcount = N.n()-2;
      cerr << "adjusted_frequencies_BasicGoodTuring :";
      cerr << " maxcount is too big, reducing it to " << maxcount << endl;
    }
    
    M.resize(N.n());
    int r;
    for(r=0; r<=maxcount;r++)
    {
      // don't like this bit, but what can you do ?
      if( (N(r+1) == 0) || (N(r) == 0) )
          M[r] = r;
      else
          M[r] = (r + 1) * N(r+1) / N(r);
    }  
    // and do not map higher counts
    for(;r<N.n();r++)
      M[r]=r;
    
    return;
    
}

static void
smoothed_frequency_distribution_ExponentialFit(EST_DVector &N,int maxcount)
{
    if (maxcount > N.n()-2)
    {
      maxcount = N.n()-2;
      cerr << "smoothed_frequency_distribution_ExponentialFit :"
          << " maxcount too big, reducing it to " << maxcount << endl;
    }
    // the idea to fit this particular fn was by Steve Isard
    // we ignore r=0 in doing the fit
    
    if (!smooth_ExponentialFit(N,1,maxcount+1))
      cerr << "smooth_ExponentialFit failed !" << endl;
    
    return;
}

bool
Good_Turing_smooth(EST_Ngrammar &ngrammar, int maxcount, int mincount)
{
    // works for any N
    // since it just remaps frequencies
    
    // frequencies above maxcount are not changed
    // for discounting -- see discount routine !
    
    
    if (ngrammar.entry_type() != EST_Ngrammar::frequencies)
    {
      cerr << "EST_Ngram: cannot Good-Turing smooth ngram:"  <<
          " entries are not frequencies" << endl;
      return false;
    }
    
    switch(ngrammar.representation())
    {

      case EST_Ngrammar::sparse:
      case EST_Ngrammar::dense:
      {
          EST_DVector freqs,mapped_freqs;
          // grammar is of a single order - simple
          // Find frequency distribution
          frequency_of_frequencies(freqs,ngrammar);
          // smoothing should be optional - to do
          smoothed_frequency_distribution_ExponentialFit(freqs,maxcount-1);
          // Build map of frequencies
          adjusted_frequencies_BasicGoodTuring(mapped_freqs,freqs,maxcount);
          // Map all frequencies in grammar to Good-Turing smoothed values
          map_frequencies(ngrammar,mapped_freqs);
      }
      break;

      case EST_Ngrammar::backoff:
      {
          cerr << "Smoothing of backed off grammars is not available!" << endl;
          return false;

          (void)mincount;

          /*
          // need to smooth for each order independently
          int i,o;
          double threshold;

          //for (o=1;o<=ngrammar.order();o++)
          for(o=ngrammar.order();o<=ngrammar.order();o++)
          {
              EST_DVector freqs,mapped_freqs;

              frequency_of_frequencies(freqs,ngrammar,o);

              cerr << "FREQ : " << freqs << endl;

              if(freqs.n() < 2)
              {
                  // i.e. only unseen things - zero them all
                  threshold = 2; // 1 ?
                  if(o>1)
                      ngrammar.backoff_traverse(ngrammar.backoff_representation,
                                                &zero_small_f,
                                                (void*)(&threshold),
                                                o-1);
                  continue;
              }

              int max = maxcount;
              if(max > freqs.n() - 2)
                  max = freqs.n() - 2;

              if(max > 2)
                  // need at least 3 points
                  // smoothing should be optional - to do
                  smoothed_frequency_distribution_ExponentialFit(freqs,max);

              cerr << "SMOOTHED : " << freqs << endl;

              adjusted_frequencies_BasicGoodTuring(mapped_freqs,freqs,max);

              // initial map of frequencies modifies total counts
              // so pdfs are correct
              map_frequencies(ngrammar,mapped_freqs,o);

              cerr << "Map for " << o << " : ";
              for(i=0;i<max;i++)
                  cerr << mapped_freqs(i) << " ";
              cerr << endl;

              cerr << "MAP : " << mapped_freqs << endl;

              // now go and zero the small frequencies
              // but not for unigram

              if(mincount < mapped_freqs.n())
                  threshold = mapped_freqs(mincount);
              else
                  threshold = mapped_freqs(mapped_freqs.n()-1) + 1; // i.e. zero everything

              //cerr << "min = " << threshold << endl;

              if(o>1)
                  ngrammar.backoff_traverse(ngrammar.backoff_representation,
                                            &zero_small_f,
                                            (void*)(&threshold),
                                            o-1);
          }
          */
      }
      break;

      default:
      cerr << "unknown representation for EST_Ngrammar" << endl;
      break;
    }

    return true;
}


void
Good_Turing_discount(EST_Ngrammar &ngrammar, const int maxcount,
                 const double default_discount)
{
    
    if(ngrammar.representation() != EST_Ngrammar::backoff)
    {
      cerr << "Good_Turing_discount is not appropriate for non backoff grammar !"
          << endl;
      return;
    }
    
    
    
    // method (for each grammar order):
    // produce Good-Turing smoothed frequency map
    // compute difference between unsmoothed map (0,1,2,3...)
    // and GT map  - this is the discount
    // we do not subtract the discount here since we need to
    // preserve the raw counts
    
    // discounting is applied up to frequency 'maxcount'
    // beyond which we do not discount
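    
    // Illustrative example (numbers invented): if the Good-Turing map gives
    // an adjusted count of 0.8 for a raw count of 1, the stored discount is
    // backoff_discount[o-1][1] = 1 - 0.8 = 0.2, while the raw count itself
    // is left untouched, as noted above.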
    
    // to do : zero low frequencies ???? (prune the tree)
    /*
       ngrammar.backoff_traverse(ngrammar.backoff_representation,
       &zero_small_f,
       (void*)(&threshold),
       o-1);
       */    
    
    // need to treat for each order independently
    int i,o;
    
    // no discounting for order 1 - YES !?!?!?
    for (o=1;o<=ngrammar.order();o++)
    {
      
      EST_DVector freqs,mapped_freqs;
      frequency_of_frequencies(freqs,ngrammar,o);
      
      int max = maxcount;
      if(max > freqs.n() - 2)
          max = freqs.n() - 2;
      
      if(max > 2)
      {
          // need at least 3 points
          // smoothing should be optional - to do
          
          // April 97
          // to overcome problems with N(x)=0 (log of zero is undefined, so
          // such points would otherwise be dropped from the fit), shift the
          // data points up by one, fit, and shift back
          
          for(i=0;i<=max+1;i++)
            freqs[i] += 1;
          
          smoothed_frequency_distribution_ExponentialFit(freqs,max);
          
          for(i=0;i<=max+1;i++)
          {
            freqs[i] -= 1;
            // possibly unnecessary check
            if ( freqs(i) < 0 )
                freqs[i] = 0;
          }
      }
      
      adjusted_frequencies_BasicGoodTuring(mapped_freqs,freqs,max);
      
      // and put it in the discount array
      ngrammar.backoff_discount[o-1].resize(freqs.n());
      for(i=(int)ngrammar.backoff_threshold;i<=max;i++)
      {
          ngrammar.backoff_discount[o-1][i] =  (double)i - mapped_freqs(i);
          
          // sanity check
          if( ngrammar.backoff_discount[o-1][i] < 0)
          {
            ngrammar.backoff_discount[o-1][i] = 0;
          }
      }
      
      for(;i<freqs.n();i++)
          ngrammar.backoff_discount[o-1][i] = default_discount;

    }
}

VAL_REGISTER_CLASS(ngrammar,EST_Ngrammar)

