// Source listing: speech-tools package, file EST_String.h

/*************************************************************************/
/*                                                                       */
/*                Centre for Speech Technology Research                  */
/*                     University of Edinburgh, UK                       */
/*                        Copyright (c) 1997                             */
/*                        All Rights Reserved.                           */
/*                                                                       */
/*  Permission is hereby granted, free of charge, to use and distribute  */
/*  this software and its documentation without restriction, including   */
/*  without limitation the rights to use, copy, modify, merge, publish,  */
/*  distribute, sublicense, and/or sell copies of this work, and to      */
/*  permit persons to whom this work is furnished to do so, subject to   */
/*  the following conditions:                                            */
/*   1. The code must retain the above copyright notice, this list of    */
/*      conditions and the following disclaimer.                         */
/*   2. Any modifications must be clearly marked as such.                */
/*   3. Original authors' names are not deleted.                         */
/*   4. The authors' names are not used to endorse or promote products   */
/*      derived from this software without specific prior written        */
/*      permission.                                                      */
/*                                                                       */
/*  THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK        */
/*  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING      */
/*  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT   */
/*  SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE     */
/*  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES    */
/*  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   */
/*  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,          */
/*  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF       */
/*  THIS SOFTWARE.                                                       */
/*                                                                       */
/*************************************************************************/

#ifndef __EST_STRING_H__
#define __EST_STRING_H__

#ifdef HAVE_CONFIG_H
#    include "est_string_config.h"
#endif

// Forward declarations: both classes are named in the interface below
// before their full definitions are available.
class EST_String;
class EST_Regex;

// Length of the starts/ends arrays taken by EST_String::subst().
#define EST_Regex_max_subexpressions 10

#include <string.h>
#ifdef NO_EST
#    include <iostream.h>
#else
#    include "EST_iostream.h"
#endif
#include <limits.h>
#include "EST_Chunk.h"
#include "EST_strcasecmp.h"
#include "EST_bool.h"

// Declared here because the gripe() macro in EST_String calls abort()
// when __GRIPE_FATAL__ is set.
extern "C" void abort(void);

/** A non-copyleft implementation of a string class to use with
  * compilers that aren't GNU C++.
  *
  * Strings are reference-counted and reasonably efficient (eg you
  * can pass them around, into and out of functions and so on
  * without worrying too much about the cost).
  *
  * The associated class EST_Regex can be used to represent regular
  * expressions.
  *
  * @see EST_Chunk
  * @see EST_Regex
  * @see string_example
  * @author Alan W Black <awb@cstr.ed.ac.uk>
  * @author Richard Caley <rjc@cstr.ed.ac.uk>
  * @version $Id: EST_String.h,v 1.2 2001/04/04 13:11:27 awb Exp $
  */

00077 class EST_String {

    /** For better libg++ compatibility. 
      * 
      * Includes String from char constructor which
      * tends to mask errors in use. Also reverses the () and [] operators.
      */
#   define __FSF_COMPATIBILITY__ (0)

    /** Allow gsub() to be used in multi-threaded applications
      * This will cause gsub to use a local table of substitution points
      * walloced for each gsub. Otherwise one global one is used which
      * should be faster, but non reentrant.
      */
#   define __GSUB_REENTRANT__ (1)

/// Gripe about wierd arguments like Nulls
#define __STRING_ARG_GRIPE__ (1)

///  When we find something to gripe about we die then and there.
#define __GRIPE_FATAL__ (1)

#if __GRIPE_FATAL__
#    define gripe(WHAT) (cerr<< ("oops! " WHAT "\n"),abort())
#else
#    define gripe(WHAT) (cerr<< ("oops! " WHAT "\n"))
#endif

#if __STRING_ARG_GRIPE__
#   define safe_strlen(S) ((S)?strlen(S):(gripe("null strlen"),0))
#   define CHECK_STRING_ARG(S) if (!(S)) gripe("null string arg")
#else
#   define safe_strlen(S) ((S)?strlen(S):0)
#   define CHECK_STRING_ARG(S) /* empty */
#endif

public:
    /// Global version string.
00115     static const char *version;

    /// Constant empty string
00118     static const EST_String Empty;

    /// Type of string size field.
00121     typedef int EST_string_size;
    /// Maximum string size.
#  define MAX_STRING_SIZE (INT_MAX)

private:
    /// Smart pointer to actual memory.
00127     EST_ChunkPtr memory;
    /// Size of string.
00129     EST_string_size size;

    // Make sure this is exactly the same as an EST_String. This is being too
    // clever by half.
  
    struct EST_dumb_string {
      EST_ChunkPtr memory;
      EST_string_size size;
    } ;

    /// Flags indicating which bit of a string to extract.
00140     enum EST_chop_direction {
      Chop_Before = -1,
      Chop_At     = 0,
      Chop_After  = 1
    };

    /// Simple utility which removes const-ness frommemory
00147     static inline EST_ChunkPtr &NON_CONST_CHUNKPTR(const EST_ChunkPtr &ecp) 
      { return *((EST_ChunkPtr *)&ecp);}

    /// private constructor which uses the buffer given.
00151     EST_String(int len, EST_ChunkPtr cp) {
      size=len;
      memory = cp;
    }

    /// Is more than one String represented by the same memory?
00157     int shareing (void) { return memory.shareing();}

    /**@name Finding substrings */
    //@{
    /// Find substring 
    int locate(const char *it, int len, int from, int &start, int &end) const;
    /// Find substring
00164     int locate(const EST_String &s, int from, int &start, int &end) const
      { return locate((const char *)s.memory, s.size, from, start, end); }
    /// Find match for regexp.
    int locate(EST_Regex &ex, int from, int &start, int &end, int *starts=NULL, int *ends=NULL) const;
    //@}


    /**@name Extract Substrings */
    //@{
    int extract(const char *it, int len, int from, int &start, int &end) const;
    int extract(const EST_String &s, int from, int &start, int &end) const
      { return extract((const char *)s.memory, s.size, from, start, end); }
    int extract(EST_Regex &ex, int from, int &start, int &end) const;
    //@}

    /**@name Chop out part of string */
    //@{
    /// Locate subsring and chop.
    EST_String chop_internal(const char *s, int length, int pos, EST_chop_direction directionult) const;
    /// Chop at given position.
    EST_String chop_internal(int pos, int length, EST_chop_direction directionult) const;
  
    /// Locate match for expression and chop.
    EST_String chop_internal(EST_Regex &ex, int pos, EST_chop_direction directionult) const;
    //@}

    /**@name Global search and replace */
    //@{
    /// Substitute for string
    int gsub_internal(const char *os, int olength, const char *s, int length);
    /// Substitute for matches of regexp.
    int gsub_internal(EST_Regex &ex, const char *s, int length);
    //@}

    /// Split the string down into parts. 
    int split_internal(EST_String result[], int max, const char* s_seperator, int slen, EST_Regex *re_separator, char quote) const;

    int Int(bool *ok_p) const;
    long Long(bool *ok_p) const;
    float Float(bool *ok_p) const;
    double Double(bool *ok_p) const;
public:

    /// Construct an empty string.
00208     EST_String(void) :memory() {size=0;}

    /// Construct from char *
    EST_String(const char *s);

    /// Construct from part of char * or fill with given character.
    EST_String(const char *s, int start_or_fill, int len);

    /// Construct from C string.
    EST_String(const char *s, int s_size, int start, int len);

    // Create from EST_String
    EST_String(const EST_String &s, int start, int len);

    /** Copy constructor
      * We have to declare our own copy constructor to lie to the
      * compier about the constness of the RHS.
      */
00226     EST_String(const EST_String &s) {
      memory = NON_CONST_CHUNKPTR(s.memory);
      size = s.size;
    }

#if __FSF_COMPATABILITY__
    /** Construct from single char.
      * This constructor is not usually included as it can mask errors.
      * @see  __FSF_COMPATABILITY__
      */
    EST_String(const char c);
#endif

    /// Destructor.
00240     ~EST_String() {
      size=0;
      memory=NULL;
    }

    /// Length of string ({\em not} length of underlying chunk)
00246     int length(void) const { return size; }
    /// Size of underlying chunk.
00248     int space (void) const { return memory.size(); }
    /// Get a const-pointer to the actual memory.
00250     const char *str(void) const { return size==0?"":(const char *)memory; }
    /// Get a writable pointer to the actual memory.
00252     char *updatable_str(void) { return size==0?(char *)"":(char *)memory; }
    void make_updatable(void) { ::make_updatable(memory, size+1);}


    /// Build string from a single character.
00257     static EST_String FromChar(const char c) 
      { const char s[2] = { c, 0 }; return EST_String(s); }

    /// Build string from an integer.
    static EST_String Number(int i, int base=10);

    /// Build string from a long integer.
    static EST_String Number(long i, int base=10);

    /// Build string from a double.
    static EST_String Number(double d);

    /// Build string from a float
    static EST_String Number(float f);

    /// Convert to an integer
00273     int Int(bool &ok) const { return Int(&ok); }
    int Int(void) const { return Int((bool *)NULL); }

    /// Convert to a long
00277     long Long(bool &ok) const { return Long(&ok); }
    long Long(void) const { return Long((bool *)NULL); }

    /// Convert to a float
00281     float Float(bool &ok) const { return Float(&ok); }
    float Float(void) const { return Float((bool *)NULL); }

    /// Convert to a double
00285     double Double(bool &ok) const { return Double(&ok); }
    double Double(void) const { return Double((bool *)NULL); }

    /**@name Before */
    //@{
    /// Part before position
00291     EST_String before(int pos, int len=0) const
      { return chop_internal(pos, len, Chop_Before); }
    /// Part before first matching substring after pos.
00294     EST_String before(const char *s, int pos=0) const
      { return chop_internal(s, safe_strlen(s), pos, Chop_Before); }
    /// Part before first matching substring after pos.
00297     EST_String before(const EST_String &s, int pos=0) const
      { return chop_internal(s.str(), s.size, pos, Chop_Before); }
    /// Part before first match of regexp after pos.
00300     EST_String before(EST_Regex &e, int pos=0) const
      { return chop_internal(e,  pos, Chop_Before); }
    //@}

    /**@name At */
    //@{
    /// Return part at position
00307     EST_String at(int from, int len=0) const
      { return EST_String(str(),size,from<0?(size+from):from,len); }
    /// Return part where substring found (not useful, included for completeness)
00310     EST_String at(const char *s, int pos=0) const
      { return chop_internal(s, safe_strlen(s), pos, Chop_At); }
    /// Return part where substring found (not useful, included for completeness)
00313     EST_String at(const EST_String &s, int pos=0) const
      { return chop_internal(s.str(), s.size, pos, Chop_At); }
    /// Return part matching regexp.
00316     EST_String at(EST_Regex &e, int pos=0) const
      { return chop_internal(e,  pos, Chop_At); }
    //@}

    /**@name After */
    //@{
    /// Part after pos+len
00323     EST_String after(int pos, int len=1) const
      { return chop_internal(pos, len, Chop_After); }
    /// Part after substring.
00326     EST_String after(const char *s, int pos=0) const
      { return chop_internal(s, safe_strlen(s), pos, Chop_After); }
    /// Part after substring.
00329     EST_String after(const EST_String &s, int pos=0) const
      { return chop_internal(s.str(), s.size, pos, Chop_After); }
    /// Part after match of regular expression.
00332     EST_String after(EST_Regex &e, int pos=0) const
      { return chop_internal(e,  pos, Chop_After); }
    //@}
  
    /**@name Search for something */
    //@{
    /// Find a substring.
00339     int search(const char *s, int len, int &mlen, int pos=0) const
      { int start, end;
      if (locate(s, len, pos, start, end))
      { mlen=end-start; return start; }
      return -1;
      }

    /// Find a substring.
00347     int search(const EST_String s, int &mlen, int pos=0) const
      { int start, end;
      if (locate(s, pos, start, end))
      { mlen=end-start; return start; }
      return -1;
      }

    /// Find a match of the regular expression.
00355     int search(EST_Regex &re, int &mlen, int pos=0, int *starts=NULL, int *ends=NULL) const
      { int start, end;
      if (locate(re, pos, start, end, starts, ends))
      { mlen=end-start; return start; }
      return -1;
      }
    //@}


    /**@name Get position of something */
    //@{
    /// Position of substring (starting at pos)
00367     int index(const char *s, int pos=0) const
      { int start, end; return locate(s, safe_strlen(s), pos, start, end)?start:-1; }
    /// Position of substring (starting at pos)
00370     int index(const EST_String &s, int pos=0) const
      { int start, end; return locate(s, pos, start, end)?start:-1; }
    /// Position of match of regexp (starting at pos)
00373     int index(EST_Regex &ex, int pos=0) const
      { int start, end; return locate(ex, pos, start, end)?start:-1; }
    //@}
  
    /**@name Does string contain something? */
    //@{
    /// Does it contain this substring?
00380     int contains(const char *s, int pos=-1) const
      { int start, end; return extract(s, safe_strlen(s), pos, start, end); }
    /// Does it contain this substring?
00383     int contains(const EST_String &s, int pos=-1) const
      { int start, end; return extract(s, pos, start, end); }
    /// Does it contain this character?
00386     int contains(const char c, int pos=-1) const
      { int start, end; char s[2] = {c,0}; return extract(s, 1, pos, start, end); }
    /// Does it contain a match for  this regular expression?
00389     int contains(EST_Regex &ex, int pos=-1) const
      { int start, end; return extract(ex, pos, start, end); }
    //@}

    /**@name Does string exactly match? */
    //@{
    /// Exatly match this string?
    int matches(const char *e, int pos=0) const;
    /// Exatly match this string?
    int matches(const EST_String &e, int pos=0) const;
    /// Exactly matches this regular expression, can return ends of sub-expressions.
    int matches(EST_Regex &e, int pos=0, int *starts=NULL, int *ends=NULL) const;
    //@}

    /**@name Global replacement */
    //@{
    /// Substitute one string for another.
00406     int gsub(const char *os, const EST_String &s)
      { return gsub_internal(os, safe_strlen(os), s, s.size); }
    /// Substitute one string for another.
00409     int gsub(const char *os, const char *s)
      { return gsub_internal(os, safe_strlen(os), s, safe_strlen(s)); }
    /// Substitute one string for another.
00412     int gsub(const EST_String &os, const EST_String &s)
      { return gsub_internal(os, os.size, s, s.size); }
    /// Substitute one string for another.
00415     int gsub(const EST_String &os, const char *s)
      { return gsub_internal(os, os.size, s, safe_strlen(s)); }

    /// Substitute string for matches of regular expression.
00419     int gsub(EST_Regex &ex, const EST_String &s)
      { return gsub_internal(ex, s, s.size); }
    /// Substitute string for matches of regular expression.
00422     int gsub(EST_Regex &ex, const char *s)
      { return gsub_internal(ex, s, safe_strlen(s)); }
    /// Substitute string for matches of regular expression.
00425     int gsub(EST_Regex &ex, int bracket_num)
      { return gsub_internal(ex, NULL, bracket_num); }
    /// Substitute the result of a match into a string.
    int subst(EST_String source, 
            int (&starts)[EST_Regex_max_subexpressions], 
            int (&ends)[EST_Regex_max_subexpressions]);
    //@}

    /**@name Frequency counts */
    //@{
    /// Number of occurances of substring
    int freq(const char *s) const;
    /// Number of occurances of substring
    int freq(const EST_String &s) const;
    /// Number of matches of regular expression.
    int freq(EST_Regex &s) const;
    //@}

    /**@name Quoting */
    //@{
    /// Return the string in quotes with internal quotes protected.
    EST_String quote(const char quotec) const;
    /// Return in quotes if there is something to protect (e.g. spaces)
    EST_String quote_if_needed(const char quotec) const;
    /// Remove quotes and unprotect internal quotes.
    EST_String unquote(const char quotec) const;
    /// Remove quotes if any.
    EST_String unquote_if_needed(const char quotec) const;
    //@}

#if __FSF_COMPATABILITY__
    const char operator [] (int i) const { return memory[i]; }
    char &operator () (int i) { return memory(i); }
#else
    /**@name Operators */
    //@{
    /// Function style access to constant strings.
00462     const char operator () (int i) const { return memory[i]; }
    /// Array style access to writable strings.
00464     char &operator [] (int i) { return memory(i); }
#endif

    /// Cast to const char * by simply giving access to pointer.
00468     operator const char*() const {return str(); }
    operator const char*() {return str(); }
    /// Cast to char *, may involve copying.
00471     operator char*() { return updatable_str(); }

    /**@name Add to end of string. */
    //@{
    /// Add C string to end of EST_String
    EST_String &operator += (const char *b);
    /// Add EST_String to end of EST_String
    EST_String &operator += (const EST_String b);
    //@}

    /**@name Asignment */
    //@{
    /// Assign C string to EST_String
    EST_String &operator = (const char *str);
    /// Assign single character to EST_String
    EST_String &operator = (const char c);
    /// Assign EST_String to EST_String.
    EST_String &operator = (const EST_String &s);
    //@}

    /**@name Concatenation */
    //@{
    /// Concatenate  two EST_Strings
    friend EST_String operator + (const EST_String &a, const EST_String &b);
    /// Concatenate C String with EST_String
    friend EST_String operator + (const char *a, const EST_String &b);
    /// Concatenate EST_String with C String
    friend EST_String operator + (const EST_String &a, const char *b);
    //@}

    /// Repeat string N times
    friend EST_String operator * (const EST_String &s, int n);

    /**@name relational operators */
    //@{
    ///
    friend int operator == (const char *a, const EST_String &b);
    ///
    friend int operator == (const EST_String &a, const char *b)
      { return b == a; }
    ///
    friend int operator == (const EST_String &a, const EST_String &b);

    ///
    friend int operator != (const char *a, const EST_String &b)
      { return !(a==b); }
    ///
    friend int operator != (const EST_String &a, const char *b)
      { return !(a==b); }
    ///
    friend int operator != (const EST_String &a, const EST_String &b)
      { return !(a==b); }

    ///
    friend inline int operator < (const char *a, const EST_String &b)
      { return compare(a,b) < 0; }
    ///
    friend inline int operator < (const EST_String &a, const char *b) 
      { return compare(a,b) < 0; }
    ///
    friend inline int operator < (const EST_String &a, const EST_String &b) 
      { return compare(a,b) < 0; }
    ///
    friend inline int operator > (const char *a, const EST_String &b) 
      { return compare(a,b) > 0; }
    ///
    friend inline int operator > (const EST_String &a, const char *b) 
      { return compare(a,b) > 0; }
    ///
    friend inline int operator > (const EST_String &a, const EST_String &b) 
      { return compare(a,b) > 0; }
    ///
    friend inline int operator <= (const char *a, const EST_String &b) 
      { return compare(a,b) <= 0; }
    ///
    friend inline int operator <= (const EST_String &a, const char *b) 
      { return compare(a,b) <= 0; }
    ///
    friend inline int operator <= (const EST_String &a, const EST_String &b) 
      { return compare(a,b) <= 0; }
    ///
    friend inline int operator >= (const char *a, const EST_String &b) 
      { return compare(a,b) >= 0; }
    ///
    friend inline int operator >= (const EST_String &a, const char *b) 
      { return compare(a,b) >= 0; }
    ///
    friend inline int operator >= (const EST_String &a, const EST_String &b) 
      { return compare(a,b) >= 0; }
    //@}

    //@}

    /**@name String comparison.
     * All these operators return -1, 0 or 1 to indicate the sort
     * order of the strings.
     */
    //@{
    /// 
    friend int compare(const EST_String &a, const EST_String &b);
    /// 
    friend int compare(const EST_String &a, const char *b);
    /// 
    friend inline int compare(const char *a, const EST_String &b)
      { return -compare(b,a);  }
    /** Case folded comparison.
      *
      * The table argument can defined how upper and lower
      * case characters correspond. The default works for
      * ASCII.
      */
    //@{
    friend int fcompare(const EST_String &a, const EST_String &b, 
                  const unsigned char *table=NULL);

    friend int fcompare(const EST_String &a, const char *b, 
                        const unsigned char *table=NULL);
    ///
    friend inline int fcompare(const EST_String &a, const EST_String &b, 
                         const EST_String &table) 
      { return fcompare(a, b, (const unsigned char *)(const char *)table); }
    //@}
    //@}
    //@}


    /**@name Split a string into parts.
      * 
      * These functions divide up a string producing an array of
      * substrings.
      */
    //@{
    /// Split at a given separator.
00604     friend int split(const EST_String & s, EST_String result[], 
                 int max, const EST_String& seperator, char quote=0)
      { return s.split_internal(result, max, (const char *)seperator, seperator.length(), NULL, quote); }
    /// Split at a given separator.
00608     friend int split(const EST_String &s, EST_String result[], 
                 int max, const char *seperator, char quote=0)
      { return s.split_internal(result, max, seperator, strlen(seperator), NULL, quote); }
    /// Split at each match of the regular expression.
00612     friend int split(const EST_String & s, EST_String result[], int max, 
                 EST_Regex& seperator, char quote=0)
      { return s.split_internal(result, max, NULL, 0, &seperator, quote); }
    //@}

    /// Convert to upper case.
    friend EST_String upcase(const EST_String &s);
    /// Convert to lower case.
    friend EST_String downcase(const EST_String &s);

    /** Concatenate a number of strings. 
      * This is more efficiant than multiple uses of + or +=
      */
    static EST_String cat(const EST_String s1, 
                    const EST_String s2 = Empty, 
                    const EST_String s3 = Empty, 
                    const EST_String s4 = Empty, 
                    const EST_String s5 = Empty,
                    const EST_String s6 = Empty,
                    const EST_String s7 = Empty,
                    const EST_String s8 = Empty,
                    const EST_String s9 = Empty
      );

  /*  Hacky way to ignore volatile */
      EST_String & ignore_volatile(void) volatile { return *((EST_String *)(void *)this); }

    /// Stream output for EST_String.
    friend ostream &operator << (ostream &s, const EST_String &str);
    friend class EST_Regex;
}; 

// Namespace-scope redeclarations of friend functions that are declared,
// but not defined, inside EST_String.
int operator == (const char *a, const EST_String &b);
int operator == (const EST_String &a, const EST_String &b);
ostream &operator << (ostream &s, const EST_String &str);

#include "EST_Regex.h"

#endif      

// Listing generated by Doxygen 1.6.0