Logo Search packages:      
Sourcecode: speech-tools version File versions  Download package

url.c

/*************************************************************************/
/*                                                                       */
/* Copyright (c) 1997-98 Richard Tobin, Language Technology Group, HCRC, */
/* University of Edinburgh.                                              */
/*                                                                       */
/* THE SOFTWARE IS PROVIDED ``AS IS'', WITHOUT WARRANTY OF ANY KIND,     */
/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF    */
/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/
/* IN NO EVENT SHALL THE AUTHOR OR THE UNIVERSITY OF EDINBURGH BE LIABLE */
/* FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF    */
/* CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION    */
/* WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.       */
/*                                                                       */
/*************************************************************************/
/*
 * Build configuration.
 *
 * With FOR_LT defined, the lt-*.h / nsl-err.h headers are used and the
 * generic helper names used below (Strerror, Malloc, Realloc, Free, fopen)
 * are redirected to the functions named in the #defines that follow.
 * Otherwise "system.h" is included and the reporting macros are defined
 * locally in terms of fprintf(stderr, ...).
 */
#ifdef FOR_LT

#include "lt-defs.h"
#include "lt-memory.h"
#include "lt-errmsg.h"
#include "lt-comment.h"
#include "lt-safe.h"
#include "nsl-err.h"

/* Redirect the generic names used in this file to the FOR_LT equivalents. */
#define Strerror() strErr()
#define Malloc salloc
#define Realloc srealloc
#define Free sfree
/* Note: every later use of fopen in this file becomes stdsfopen. */
#define fopen stdsfopen

#else

/* NOTE(review): Malloc, Free and strdup8 are not defined in this branch;
 * presumably system.h or one of the project headers included further down
 * supplies them -- confirm. */
#include "system.h"

/* Error/warning reporting: the first (error code) argument is ignored and
 * the message is written to stderr.  The numeric suffix is the number of
 * format arguments. */
#define LT_ERROR(err, format) fprintf(stderr, format)
#define LT_ERROR1(err, format, arg) fprintf(stderr, format, arg)
#define LT_ERROR2(err, format, arg1, arg2) fprintf(stderr, format, arg1, arg2)
#define LT_ERROR3(err, format, arg1, arg2, arg3) fprintf(stderr, format, arg1, arg2, arg3)
#define WARN(err, format) fprintf(stderr, format)
#define WARN1(err, format, arg) fprintf(stderr, format, arg)

/* Text describing the current value of errno. */
#define Strerror() strerror(errno)

/* CWDBS is the size of the buffer handed to getcwd() via GETWD.
 * The MAXPATHLEN form expands to an unparenthesized expression, so it
 * should only be used where that is harmless (e.g. an array bound or a
 * trailing function argument, as in this file). */
#ifdef MAXPATHLEN
#define CWDBS MAXPATHLEN+1
#else
#define CWDBS 1025
#endif

/* Fetch the current directory into buf, which must hold CWDBS bytes. */
#define GETWD(buf) getcwd(buf,CWDBS)

/* NOTE(review): CWDBS and GETWD are only defined in this branch; a FOR_LT
 * build presumably gets them from the lt-*.h headers -- confirm. */

#endif /* FOR_LT */

#include <stdio.h>
#include <ctype.h>
#include <stdlib.h>
#include <assert.h>
#include <errno.h>
#include <string.h>           /* that's where strerror is.  really. */
#include <sys/types.h>

#ifdef WIN32
#include <direct.h>
#endif

#ifdef SOCKETS_IMPLEMENTED

#ifdef WIN32
#undef boolean
#include <winsock.h>
#include <fcntl.h>
#else
#include <unistd.h>
#include <netdb.h>
#include <sys/socket.h>
#include <netinet/in.h>
#endif

#endif

#include "string16.h"
#include "stdio16.h"
#include "url.h"

#ifdef HAVE_LIBZ
#include "zlib.h"
#ifdef macintosh
#include <fcntl.h>
#include <unix.h>
#endif
#endif

static FILE16 *http_open(const char *url, 
                   const char *host, int port, const char *path,
                   const char *type);
static FILE16 *file_open(const char *url,
                   const char *host, int port, const char *path,
                   const char *type);

static void parse_url(const char *url, 
                  char **scheme, char **host, int *port, char **path);

/*
 * Mapping of scheme names to opening functions.
 *
 * url_open() compares the scheme of the merged URL against each `scheme`
 * entry with strcmp() and calls the matching `open` function with
 * (merged url, host, port, path, type).  To support another scheme, add
 * an entry here; NSCHEME follows the array size automatically.
 */

struct {
    char *scheme; 
    /* Opener: (url, host, port, path, type) -> stream, or null on failure */
    FILE16 *(*open)(const char *, const char *, int, const char *, const char *);
} schemes[] = {
    {(char *)"http", http_open},
    {(char *)"file", file_open},
};
/* Number of entries in schemes[] */
#define NSCHEME (sizeof(schemes) / sizeof(schemes[0]))

/*
 * Build the default base URL from the current working directory,
 * i.e. essentially file:`pwd`/ .
 *
 * The result is a freshly allocated string owned by the caller.  If the
 * current directory cannot be determined a warning is issued and a copy
 * of "file:/" is returned instead.
 */

char *default_base_url(void)
{
    char cwd[CWDBS];
    char *result;
#if defined(WIN32) || defined(mac_filenames)
    char *c;
#endif

    if(!GETWD(cwd))
    {
      WARN(LEFILE, "Warning: can't get current directory for default base url\n");
      return strdup8("file:/");
    }

#if defined(WIN32)

    /* DOS: C:\a\b becomes file:/C:/a/b/ */
    /* XXX should we escape anything? */

    for(c = cwd; *c != '\0'; c++)
    {
      if(*c == '\\')
          *c = '/';
    }

    /* "file:/" + directory + "/" + terminator */
    result = Malloc(strlen(cwd) + 8);
    sprintf(result, "file:/%s/", cwd);

#elif defined(mac_filenames)

    /* Mac: a:b becomes file:/a/b/ */
    /* XXX should escape spaces and slashes, at least */

    for(c = cwd; *c != '\0'; c++)
    {
      if(*c == ':')
          *c = '/';
    }

    /* Mac getcwd (always?) leaves a trailing separator; drop it */
    c--;
    if(*c == '/')
      *c = 0;

    /* "file:/" + directory + "/" + terminator */
    result = Malloc(strlen(cwd) + 8);
    sprintf(result, "file:/%s/", cwd);

#else

    /* Unix: /a/b becomes file:/a/b/ */

    /* "file:" + directory + "/" + terminator */
    result = Malloc(strlen(cwd) + 7);
    sprintf(result, "file:%s/", cwd);

#endif

    return result;
}

/* 
 * Merge a URL with a base URL if necessary.
 *
 * url     the (possibly relative) URL to resolve.
 * base    the base URL; if null, default_base_url() is used.
 *
 * The merged URL is returned as a newly allocated string, or NULL if the
 * base URL is unusable or a relative URL names a scheme different from
 * the base's (an error message is issued in both cases).
 *
 * The parts of the merged URL are returned in _scheme, _host, _port and
 * _path if these are non-null; *_host is null when there is no host part
 * and *_port is -1 when no port was given.  On failure the output
 * arguments are not written.
 *
 * Caller should free the returned URL and the returned string parts.
 */

char *url_merge(const char *url, const char *base,
                   char **_scheme, char **_host, int *_port, char **_path)
{
    char *merged_scheme, *merged_host, *merged_path=0, *merged_url;
    char *scheme=0, *host=0, *path=0;
    char *base_scheme=0, *base_host=0, *base_path=0;
    char *default_base=0;
    int port, base_port, merged_port, i, j;
    /* room for ':' + sign + every digit of an int + terminator */
    char portbuf[3 * sizeof(int) + 3];
    char *p, *rest;
    
    /* First see if we have an absolute URL */

    parse_url(url, &scheme, &host, &port, &path);
    if(scheme && (host || *path == '/'))
    {
      merged_scheme = scheme;
      merged_host = host;
      merged_port = port;
      merged_path = path;
      merged_url = strdup8(url);
      goto ok;
    }

    /* Relative URL, so we need the base URL */

    if(!base)
      base = default_base = default_base_url();

    parse_url(base, &base_scheme, &base_host, &base_port, &base_path);
    if(!(base_scheme && (base_host || *base_path == '/')))
    {
      LT_ERROR1(LEFILE, "Error: bad base URL <%s>\n", base);
      goto bad;
    }

    /* Determine merged path */

    if(path[0] == '/') 
    {
      /* not relative, use as-is; ownership moves to merged_path */
      merged_path = path;
      path = 0;
    } 
    else 
    {
      /* relative, append to base path */

      merged_path = Malloc(strlen(base_path) + strlen(path) + 1);
      strcpy(merged_path, base_path);

      /* strip last component of base (everything after the last '/') */

      for(i=strlen(merged_path)-1; i>=0 && merged_path[i] != '/'; i--)
          merged_path[i] = '\0';

      /* append relative path */

      strcat(merged_path, path);
      
      /* Remove . and .. components from path */

      p = merged_path;
      for(i=0; p[i]; ) 
      {
          /* i always indexes the '/' that starts a segment */
          assert(p[i] == '/');

          /* find next segment */

          for(j=i+1; p[j] && p[j] != '/'; j++)
            ;

          /* Do we have "." ? */

          if(j - i == 2 && p[i+1] == '.')
          {
            /* Source and destination overlap, so memmove, not strcpy */
            rest = p[j] ? &p[j+1] : &p[j];
            memmove(&p[i+1], rest, strlen(rest) + 1);
            continue;
          }

          /* Do we have "<segment>/.."  with <segment> != ".." ? */

          /* (We know we're not looking at "./" so we don't have to
           * worry about "./..")
           */

          if(p[j] == '/' && p[j+1] == '.' && p[j+2] == '.' &&
             (p[j+3] == '/' || p[j+3] == '\0') &&
             (j - i != 3 || p[i+1] != '.' || p[i+2] != '.'))
          {
            /* Drop "<segment>/.." (and a following '/'); overlapping copy */
            rest = p[j+3] ? &p[j+4] : &p[j+3];
            memmove(&p[i+1], rest, strlen(rest) + 1);
            i = 0;            /* start again from beginning */
            continue;
          }

          /* move to next segment */

          i = j;
      }
    }

    /* Check for deviant relative URLs like file:foo */

    /* (path can only have been handed over to merged_path above when it
     * began with '/'; with a scheme and no host that case was already
     * treated as absolute, so path is non-null whenever it is examined.)
     */

    if(scheme && !host && *path != '/')
    {
      if(strcmp(scheme, base_scheme) == 0)
      {
          WARN1(LEFILE,
        "Warning: relative URL <%s> contains scheme, contrary to RFC 1808\n",
              url);
      }
      else
      {
          LT_ERROR2(LEFILE,
           "Error: relative URL <%s> has scheme different from base <%s>\n",
                   url, base);
          goto bad;
      }
    }

    /* Return the parts and the whole thing */

    merged_scheme = base_scheme; if(scheme) Free(scheme);

    if(host)
    {
      merged_host = host; Free(base_host);
      merged_port = port;
    }
    else
    {
      merged_host = base_host;
      merged_port = base_port;
    }

    Free(path); Free(base_path);

    if(merged_host) 
    {
      /* Format the optional port first so the allocation is exact */
      if(merged_port == -1)
          portbuf[0] = '\0';
      else
          sprintf(portbuf, ":%d", merged_port);

      merged_url = Malloc(strlen(merged_scheme) + 3 + strlen(merged_host) +
                    strlen(portbuf) + strlen(merged_path) + 1);
      sprintf(merged_url, "%s://%s%s%s", 
            merged_scheme, merged_host, portbuf, merged_path);
    }
    else
    {
      merged_url = Malloc(strlen(merged_scheme) + 1 + 
                    strlen(merged_path) + 1);
      sprintf(merged_url, "%s:%s", merged_scheme, merged_path);
    }

ok:
    Free(default_base);
    if(_scheme) *_scheme = merged_scheme; else Free(merged_scheme);
    if(_host) *_host = merged_host; else Free(merged_host);
    if(_port) *_port = merged_port;
    if(_path) *_path = merged_path; else Free(merged_path);

    return merged_url;

bad:
    Free(default_base);
    Free(scheme);
    Free(host);
    Free(path);
    Free(base_scheme);
    Free(base_host);
    Free(base_path);
    /* Non-null only when a separate merged path was already built */
    Free(merged_path);

    return NULL;
}

/* 
 * Open a stream to a URL.
 * url may be a relative URL, in which case it is merged with base,
 * which is typically the URL of the containing document.  If base
 * is null, file:`pwd`/ is used, which is the right thing to do for
 * filenames.  If base is "", there is no base URL and relative
 * URLs will fail.
 * If merged_url is non-null and the open succeeds, the resulting URL is
 * stored in it and the caller should free it; on failure *merged_url is
 * not written.
 * If type begins "r", the URL is opened for reading, if "w" for
 * writing.  Writing is only supported for file URLs.
 * If the type begins "rl", the data will be copied to a temporary
 * file so that seeking is possible (NOT YET IMPLEMENTED).
 * When built with HAVE_LIBZ, a merged URL ending in ".gz" is read or
 * written through zlib.
 * Returns a FILE16 for success, NULL for failure.
 */

FILE16 *url_open(const char *url, const char *base, const char *type,
             char **merged_url)
{
    char *scheme, *host, *path, *m_url;
    int port, i;
    FILE16 *f;
#ifdef HAVE_LIBZ
    int len, gzipped = 0;
#endif

    /* Determine the merged URL */

    if(!(m_url = url_merge(url, base, &scheme, &host, &port, &path)))
      return 0;

#ifdef HAVE_LIBZ
    len = strlen(m_url);
    if(len > 3 && strcmp8(m_url+len-3, ".gz") == 0)
      gzipped = 1;
#endif

    /*
    printf("<%s> <%s> <%d> <%s>\n", scheme, host ? host : "", port, path);
    printf("%s\n", m_url);
    */

    /* Find the appropriate opening function */

    for(i=0; i<(int)NSCHEME; i++)
      if(strcmp(scheme, schemes[i].scheme) == 0) 
          break;

    if(i == (int)NSCHEME)
    {
      /* Not implemented */

      LT_ERROR1(LEFILE, "Error: scheme \"%s\" not implemented\n", scheme);

      Free(scheme);
      if(host)
          Free(host);
      Free(path);
      Free(m_url);

      return 0;
    }

    f = schemes[i].open(m_url, host, port, path, type);

    /* The parts are no longer needed whether or not the open worked */

    Free(scheme);
    if(host)
      Free(host);
    Free(path);

    if(!f)
    {
      /* The opener has reported the problem; don't leak the URL */
      Free(m_url);
      return 0;
    }

#ifdef HAVE_LIBZ
    if(gzipped)
    {
      /* We have a gzip-compressed file which we hand to gzdopen
       * for further processing.  zlib gets a duplicate of the
       * descriptor so the original stream can be closed.
       */
      gzFile gfile = 0;
      /* NOTE(review): presumably GetFILE yields null when the stream is
       * not backed by a stdio FILE -- confirm. */
      FILE *file = GetFILE(f);

      if(file)
      {
#ifdef macintosh
          gfile = gzdopen(dup(fileno(file)), *type == 'r' ? "rb" : "wb");
#else       
          gfile = gzdopen(dup(fileno(file)), type);
#endif
      }

      Fclose(f);

      if(!gfile)
      {
          LT_ERROR1(LEFILE, 
                   "Can't attach gzip processor to URL \"%s\"\n",
                   m_url);
          Free(m_url);
          return 0;
      }

      f = MakeFILE16FromGzip(gfile, type);
    }
#endif

    if(f && merged_url)
      *merged_url = m_url;
    else
      Free(m_url);

    return f;
}

#ifdef SOCKETS_IMPLEMENTED

/* Release a raw socket that has not (yet) been wrapped in a stream. */

static void http_close_socket(int s)
{
#ifdef WIN32
    closesocket(s);
#else
    close(s);
#endif
}

#endif

/*
 * Open an http URL for reading.
 *
 * url is used only in messages; host, port (-1 means 80) and path come
 * from the parsed URL.  type must begin with 'r'.
 * Sends an HTTP/1.0 GET request, checks that the status is 200, skips
 * the remaining headers and returns a stream positioned at the body.
 * Returns null (after issuing an error message) on any failure, and
 * always when SOCKETS_IMPLEMENTED is not defined.
 */

static FILE16 *http_open(const char *url,
                   const char *host, int port, const char *path,
                   const char *type)
{
#ifndef SOCKETS_IMPLEMENTED
    LT_ERROR(NEUNSUP, 
            "http: URLs are not yet implemented on this platform\n");
    return 0;
#else
    /* 
     * Apparently on the Macintosh, \n might not be ASCII LF, so we'll
     * use numerics to be sure.  HTTP lines end with CR LF, i.e.
     * \015 followed by \012.
     */
    static const char request_format[] =
      "GET %s HTTP/1.0\015\012Connection: close\015\012\015\012";
    FILE16 *f16;
    struct sockaddr_in addr;
    struct hostent *hostent;
    int s, server_major, server_minor, status, count, c;
    char reason[81];
#ifndef WIN32
    FILE *fin,*fout;
    int s2;
#else
    static int inited=0;
    int i;
    static char buf[1024];
    if (!inited) 
    {
      WORD version = MAKEWORD(1, 1);
      WSADATA wsaData;
      int err = WSAStartup(version, &wsaData);
      if (err)
      {
          LT_ERROR(LEFILE, "Error: can't init HTTP interface\n");
          return 0;
      }
      else if(LOBYTE(wsaData.wVersion) != 1 || HIBYTE(wsaData.wVersion) != 1)
      {
          LT_ERROR(LEFILE, "Error: wrong version of WINSOCK\n");
          WSACleanup();
          return 0;
      }
      inited = 1;
    }
#endif

    if(*type != 'r')
    {
      LT_ERROR1(LEFILE, "Error: can't open http URL \"%s\" for writing\n",
                 url);
      return 0;
    }

    if(!host)
    {
      LT_ERROR1(LEFILE, "Error: no host part in http URL \"%s\"\n", url);
      return 0;
    }

    /* Create the socket */

    s = socket(PF_INET, SOCK_STREAM, 0);
#ifdef WIN32
    if (s == INVALID_SOCKET) {
      LT_ERROR1(LEFILE, "Error: system call socket failed: %d\n",
               WSAGetLastError());
      return 0;
    }
#else
    if(s == -1) {
      LT_ERROR1(LEFILE, "Error: system call socket failed: %s\n",
                 Strerror());
      return 0;
    }
#endif

    /* Find the server address */

    hostent = gethostbyname(host);
    if(!hostent)
    {
      LT_ERROR1(LEFILE,
                 "Error: can't find address for host in http URL \"%s\"\n",
                 url);
      http_close_socket(s);
      return 0;
    }

    memset(&addr, 0, sizeof(addr));
    addr.sin_family = AF_INET;
    /* If we were really enthusiastic, we would try all the host's addresses */
    memcpy(&addr.sin_addr, hostent->h_addr, hostent->h_length);
    addr.sin_port = htons((u_short)(port == -1 ? 80 : port));

    /* Connect */

    if(connect(s, (struct sockaddr *)&addr, sizeof(addr)) == -1)
    {
      LT_ERROR1(LEFILE, "Error: system call connect failed: %s\n",
                 Strerror());
      http_close_socket(s);
      return 0;
    }

#ifndef WIN32
    /* Wrap the socket in an unbuffered read stream, and a duplicate of
     * it in a write stream, so each can be closed independently.
     */
#ifdef macintosh
    fin = fdopen(s, "rb");
#else
    fin = fdopen(s, "r");
#endif
    if(!fin)
    {
      LT_ERROR1(LEFILE, "Error: system call fdopen failed: %s\n",
                 Strerror());
      http_close_socket(s);
      return 0;
    }
    setvbuf(fin, 0, _IONBF, 0);

    s2 = dup(s);
    if(s2 == -1)
      fout = 0;
    else
#ifdef macintosh
      fout = fdopen(s2, "wb");
#else
      fout = fdopen(s2, "w");
#endif
    if(!fout)
    {
      LT_ERROR1(LEFILE, "Error: can't make output stream for socket: %s\n",
                 Strerror());
      if(s2 != -1)
          http_close_socket(s2);
      fclose(fin);
      return 0;
    }
#endif

    /* Send the request */

#ifdef WIN32
    /* sizeof(request_format) counts the "%s" and the terminator, so this
     * bound is slightly conservative.
     */
    if(strlen(path) + sizeof(request_format) > sizeof(buf))
    {
      LT_ERROR1(LEFILE, "Error: path too long in http URL \"%s\"\n", url);
      http_close_socket(s);
      return 0;
    }
    sprintf(buf, request_format, path);
    if (send(s,buf,strlen8(buf),0)==SOCKET_ERROR) {
      LT_ERROR1(LEFILE, "Error: system call send failed: %d\n",
               WSAGetLastError());
      http_close_socket(s);
      return 0;
    }
#else
    fprintf(fout, request_format, path);

    /* We used to test for errors after doing fclose, but this seemed
       to produce spurious errors under Linux (RedHat 4.2), so now we
       do fflush and test after that. */

    fflush(fout);
    if(ferror(fout))
    {
      LT_ERROR1(LEWRTF, "Error: write to socket failed: %s\n",Strerror());
      fclose(fout);
      fclose(fin);
      return 0;
    }
    fclose(fout);
#endif

    /* Read the status line */
#ifdef WIN32
    for(i=0; i<(int)sizeof(buf)-1; i++)
    {
      if(recv(s, &buf[i], 1, 0) != 1)
      {
          LT_ERROR1(LEFILE,
                   "Error: recv error from server for URL \"%s\"\n",
                   url);
          http_close_socket(s);
          return 0;
      }
      if(buf[i] == '\n')
      {
          i++;              /* keep the newline */
          break;
      }
    }
    buf[i] = '\0';          /* i <= sizeof(buf)-1 here */
    count=sscanf(buf, "HTTP/%d.%d %d %80[^\012]", 
             &server_major, &server_minor, &status, reason);
#else    
    count=fscanf(fin, "HTTP/%d.%d %d %80[^\012]", 
             &server_major, &server_minor, &status, reason);
#endif

    if(count != 4)
    {
      LT_ERROR3(LEFILE,
                 "Error: bad header from server for URL \"%s\"\n%d %s\n",
                 url, count, Strerror());
#ifndef WIN32
      fclose(fin);
#else
      http_close_socket(s);
#endif
      return 0;
    }

    if(status != 200)
    {
      /* We should handle 301 (redirection) but we don't */
      LT_ERROR3(LEFILE, "Error: can't retrieve \"%s\": %d %s\n",
                 url, status, reason);
#ifndef WIN32
      fclose(fin);
#else
      http_close_socket(s);
#endif
      return 0;
    }

    /* Skip other headers: stop after two line feeds separated by
     * nothing but carriage returns, i.e. after the blank line.
     */

    count = 0;
#ifdef WIN32
    while(recv(s, buf, 1, 0) == 1 && (c = buf[0], 1) || (c = EOF, 0))
#else
    while((c = getc(fin)) != EOF)
#endif
    {
      if(c == '\012')
          count++;
      else if(c != '\015')
          count = 0;
      if(count == 2)
          break;
    }

    if(c == EOF)
    {
      LT_ERROR1(LEFILE, "Error: EOF in headers retrieving \"%s\"\n", url);
#ifndef WIN32
      fclose(fin);
#else
      http_close_socket(s);
#endif
      return 0;
    }

#ifdef WIN32
    f16 = MakeFILE16FromWinsock(s, type);
#else
    f16 = MakeFILE16FromFILE(fin, type);
#endif

    SetCloseUnderlying(f16, 1);
    return f16;
#endif /* SOCKETS_IMPLEMENTED */
}

/*
 * Open a file URL (easy, at least on unix).
 *
 * url is used only in messages; a non-empty host draws a warning and is
 * otherwise ignored; port is unused.  path is converted to the platform's
 * filename syntax and opened with fopen() using type as the mode.
 * Returns the wrapped stream, or null (after perror) if the open fails.
 */

static FILE16 *file_open(const char *url,
                   const char *host, int port, const char *path, 
                   const char *type)
{
    FILE *fp;
    FILE16 *stream;
    char *native;
#if defined(WIN32) || defined(mac_filenames)
    char *c;
#endif

    if(host && host[0])
      WARN1(LEFILE, "Warning: ignoring host part in file URL \"%s\"\n", url);

#if defined(WIN32)

    /* DOS: /C:/a/b.c becomes C:\a\b.c */

    if(path[0] == '/' && path[1] && path[2] == ':')
      path++;

    native = strdup8(path);
    for(c = native; *c != '\0'; c++)
    {
      if(*c == '/')
          *c = '\\';
    }

#elif defined(mac_filenames)

    /* Mac: /a/b.c becomes a:b.c */

    if(*path == '/')
      path++;

    native = strdup8(path);
    for(c = native; *c != '\0'; c++)
    {
      if(*c == '/')
          *c = ':';
    }

#else

    /* Unix: a path is a path is a path! */

    native = strdup8(path);

#endif

    /* XXX should undo any escapes */

    fp = fopen(native, type);
    if(!fp)
      perror(native);       /* report before anything can disturb errno */

    Free(native);

    if(!fp)
      return 0;

    stream = MakeFILE16FromFILE(fp, type);
    SetCloseUnderlying(stream, 1);

    return stream;
}

/*
 * Split a URL into its parts.
 *
 * On return:
 *   *scheme  newly allocated scheme name (the text before the first ':'
 *            when that ':' comes before any '/'), or null if there is none;
 *   *host    newly allocated host from a "//host[:port]" part, or null if
 *            the URL has no such part;
 *   *port    the number following the last ':' of that part when it is
 *            followed only by digits, otherwise -1;
 *   *path    newly allocated remainder of the URL, "/" if the remainder is
 *            empty; never null.  Backslashes in it are replaced by slashes
 *            (with one warning per call).
 * The caller frees the strings.
 */

static void parse_url(const char *url, 
                  char **scheme, char **host, int *port, char **path)
{
    const char *p, *q;
    char *s;
    int warned = 0;

    *scheme = *host = *path = 0;
    *port = -1;

    /* Does it start with a scheme? */
    
    for(p = url; *p; p++)
      if(*p == ':' || *p == '/')
          break;

    if(p > url && *p == ':')
    {
      *scheme = Malloc(p - url + 1);
      memcpy(*scheme, url, p - url);
      (*scheme)[p - url] = '\0';
      url = p+1;
    }

    /* Does it have a net_loc? */

    if(url[0] == '/' && url[1] == '/')
    {
      url += 2;

      /* The net_loc runs up to the next slash or the end of the string */

      for(p = url; *p; p++)
          if(*p == '/')
            break;

      /* Does it have a port number?  Scan back over trailing digits. */

      /* (isdigit needs an unsigned char value; a plain char may be
       * negative for bytes above 127.)
       */

      for(q = p-1; q >= url; q--)
          if(!isdigit((unsigned char)*q))
            break;

      /* q may now be url-1, which is the second '/' of the "//" we just
       * skipped, so reading *q is still within the string.
       */

      if(q < p-1 && *q == ':')
          *port = atoi(q+1);
      else
          q = p;

      *host = Malloc(q - url + 1);
      memcpy(*host, url, q - url);
      (*host)[q - url] = '\0';
      url = p;
    }

    /* The rest is the path */

    if(*url)
      *path = strdup8(url);
    else
      *path = strdup8("/");

    /* Windows users have a tendency to use backslashes instead of slashes */

    for(s=*path; *s; s++)
      if(*s == '\\')
      {
          if(!warned)
          {
            /* url still points at the unmodified path text */
            WARN1(LEFILE, "Warning: illegal backslashes in URL path \"%s\""
                          " replaced by slashes\n", url);
            warned = 1;
          }

          *s = '/';
      }
}


Generated by  Doxygen 1.6.0   Back to index