/* tld.c --- Handle TLD restriction checking.
* Copyright (C) 2004, 2005, 2006, 2007 Simon Josefsson.
* Copyright (C) 2003, 2004 Free Software Foundation, Inc.
*
* Author: Thomas Jacob, Internet24.de
*
* This file is part of GNU Libidn.
*
* GNU Libidn is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* GNU Libidn is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with GNU Libidn; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
*
*/
/* Get stringprep_utf8_to_ucs4, stringprep_locale_to_utf8. */
#include <stringprep.h>
/* Get strcmp(). */
#include <string.h>
/* Get specifications. */
#include <tld.h>
/* Array of built-in domain restriction structures. See tlds.c.
   tld_default_table() passes this array to tld_get_table(), which
   walks it until it reaches a NULL entry, so the array has to be
   NULL terminated. */
extern const Tld_table *_tld_tables[];
/**
 * tld_get_table - look up a TLD name in an array of tables
 * @tld: TLD name (e.g. "com") as zero terminated ASCII byte string.
 * @tables: Array of pointers to #Tld_table info-structures, terminated
 *   by a %NULL pointer.
 *
 * Walk @tables from its first entry and compare the name of every
 * entry with @tld.  The comparison is done with strcmp(), so it is
 * exact and case sensitive.
 *
 * Return value: The first entry of @tables whose name equals @tld, or
 *   %NULL if no entry matches or if @tld or @tables is %NULL.
 */
const Tld_table *
tld_get_table (const char *tld, const Tld_table ** tables)
{
  size_t idx;

  if (tld == NULL || tables == NULL)
    return NULL;

  for (idx = 0; tables[idx] != NULL; idx++)
    {
      const Tld_table *candidate = tables[idx];

      if (strcmp (candidate->name, tld) == 0)
        return candidate;
    }

  return NULL;
}
/**
* tld_default_table - get table for a TLD name
* @tld: TLD name (e.g. "com") as zero terminated ASCII byte string.
* @overrides: Additional zero terminated array of #Tld_table
* info-structures for TLDs, or %NULL to only use library deault
* tables.
*
* Get the TLD table for a named TLD, using the internal defaults,
* possibly overrided by the (optional) supplied tables.
*
* Return value: Return structure corresponding to TLD @tld_str, first
* looking through @overrides then thru built-in list, or %NULL if
* no such structure found.
*/
const Tld_table *
tld_default_table (const char *tld, const Tld_table ** overrides)
{
const Tld_table *tldtable = NULL;
if (!tld)
return NULL;
if (overrides)
tldtable = tld_get_table (tld, overrides);
if (!tldtable)
tldtable = tld_get_table (tld, _tld_tables);
return tldtable;
}
/* Non-zero if code point c is one of the four dot characters that are
   treated as label separators: U+002E FULL STOP, U+3002 IDEOGRAPHIC
   FULL STOP, U+FF0E FULLWIDTH FULL STOP or U+FF61 HALFWIDTH
   IDEOGRAPHIC FULL STOP.  The argument is expanded up to four times,
   so it must not have side effects. */
#define DOTP(c) ((c) == 0x002E || (c) == 0x3002 || \
(c) == 0xFF0E || (c) == 0xFF61)
/**
 * tld_get_4 - extract top level domain part in input Unicode string
 * @in: Array of unicode code points to process. Does not need to be
 *   zero terminated.
 * @inlen: Number of unicode code points.
 * @out: Zero terminated ascii result string pointer.  Must not be
 *   %NULL.
 *
 * Isolate the top-level domain of @in and return it as a lower case
 * ASCII string in @out.  The top-level domain is the run of ASCII
 * letters at the very end of @in, and it is only accepted if it is
 * non-empty and immediately preceded by a dot (see DOTP).  Input that
 * consists of letters only, such as "com", therefore has no TLD.
 *
 * On success *@out points to a buffer obtained from malloc(), which
 * the caller has to release with free().  On every failure *@out is
 * set to %NULL.
 *
 * Return value: Return %TLD_SUCCESS on success, %TLD_NODATA if @in is
 *   %NULL or @inlen is zero, %TLD_NO_TLD if no top-level domain was
 *   found, or %TLD_MALLOC_ERROR if memory could not be allocated.
 */
int
tld_get_4 (const uint32_t * in, size_t inlen, char **out)
{
  size_t start;                 /* Index of the first TLD code point. */
  size_t olen;                  /* Number of code points in the TLD. */
  size_t i;
  char *out_s;

  *out = NULL;
  if (!in || inlen == 0)
    return TLD_NODATA;

  /* Scan backwards over the trailing run of (latin) letters.  Indices
     are used instead of a moving pointer so that nothing before the
     start of the array is ever formed or dereferenced. */
  start = inlen;
  while (start > 0 && ((in[start - 1] >= 0x41 && in[start - 1] <= 0x5A) ||
                       (in[start - 1] >= 0x61 && in[start - 1] <= 0x7A)))
    start--;
  olen = inlen - start;

  /* start == 0 means the letters reach the beginning of the input, so
     there is no preceding code point that could be a dot. */
  if (olen == 0 || start == 0 || !DOTP (in[start - 1]))
    return TLD_NO_TLD;

  out_s = malloc (olen + 1);
  if (!out_s)
    return TLD_MALLOC_ERROR;

  /* Transcribe to lowercase ascii string.  Only A-Z and a-z can occur
     here, so everything not above 'Z' is an upper case letter. */
  for (i = 0; i < olen; i++)
    {
      uint32_t c = in[start + i];

      out_s[i] = (char) (c > 0x5A ? c : c + 0x20);
    }
  out_s[olen] = '\0';

  *out = out_s;
  return TLD_SUCCESS;
}
/**
 * tld_get_4z - extract top level domain part in input Unicode string
 * @in: Zero terminated array of unicode code points to process.
 * @out: Zero terminated ascii result string pointer.  Must not be
 *   %NULL.
 *
 * Isolate the top-level domain of @in and return it as an ASCII
 * string in @out.  The length of @in is determined by searching for
 * the terminating zero, then the work is done by tld_get_4().
 *
 * *@out is set to %NULL on every failure, including when @in is %NULL,
 * so it is always safe to inspect or free() after the call.
 *
 * Return value: Return %TLD_SUCCESS on success, or the corresponding
 *   #Tld_rc error code otherwise.
 */
int
tld_get_4z (const uint32_t * in, char **out)
{
  size_t len = 0;

  if (!in)
    {
      /* Match tld_get_4(), which never leaves *out unset. */
      *out = NULL;
      return TLD_NODATA;
    }

  while (in[len])
    len++;

  return tld_get_4 (in, len, out);
}
/**
* tld_get_z - extract top level domain part in input string
* @in: Zero terminated character array to process.
* @out: Zero terminated ascii result string pointer.
*
* Isolate the top-level domain of @in and return it as an ASCII
* string in @out. The input string @in may be UTF-8, ISO-8859-1 or
* any ASCII compatible character encoding.
*
* Return value: Return %TLD_SUCCESS on success, or the corresponding
* #Tld_rc error code otherwise.
*/
int
tld_get_z (const char *in, char **out)
{
uint32_t *iucs;
size_t i, ilen;
int rc;
ilen = strlen (in);
iucs = calloc (ilen, sizeof (*iucs));
if (!iucs)
return TLD_MALLOC_ERROR;
for (i = 0; i < ilen; i++)
iucs[i] = in[i];
rc = tld_get_4 (iucs, ilen, out);
free (iucs);
return rc;
}
/*
 * _tld_checkchar - verify that character is permitted
 * @ch: 32 bit unicode character to check.
 * @tld: A #Tld_table data structure to check @ch against.
 *
 * Verify if @ch is either in [a-z0-9-], one of the dots accepted by
 * DOTP, or covered by one of the start..end ranges listed in @tld.
 * The ranges are located with a binary search, which assumes that
 * they are sorted in ascending order and do not overlap.
 *
 * Return value: Return the #Tld_rc value %TLD_SUCCESS if @ch is a
 *   valid character for the TLD @tld or if @tld is %NULL,
 *   %TLD_INVALID if @ch is invalid as defined by @tld.
 */
static int
_tld_checkchar (uint32_t ch, const Tld_table * tld)
{
  size_t lo, hi;

  if (tld == NULL)
    return TLD_SUCCESS;

  /* Characters that are always allowed: '-', dots, 0-9 and a-z. */
  if (ch == 0x2D || DOTP (ch)
      || (ch >= 0x30 && ch <= 0x39) || (ch >= 0x61 && ch <= 0x7A))
    return TLD_SUCCESS;

  /* Binary search; the candidates are the indices in [lo, hi). */
  lo = 0;
  hi = tld->nvalid;
  while (lo < hi)
    {
      size_t mid = lo + (hi - lo) / 2;
      const Tld_table_element *range = &tld->valid[mid];

      if (ch < range->start)
        hi = mid;
      else if (ch > range->end)
        lo = mid + 1;
      else
        return TLD_SUCCESS;
    }

  return TLD_INVALID;
}
/**
 * tld_check_4t - verify that characters are permitted
 * @in: Array of unicode code points to process. Does not need to be
 *   zero terminated.
 * @inlen: Number of unicode code points.
 * @errpos: Where to store the index of the offending code point, or
 *   %NULL if the position is not wanted.
 * @tld: A #Tld_table data structure representing the restrictions for
 *   which the input should be tested.
 *
 * Test the code points in @in one by one, in order, against the
 * restrictions in @tld and stop at the first one that is rejected.
 * Its index is stored in @errpos; @errpos is not written to when
 * every code point is accepted.
 *
 * Return value: Returns the #Tld_rc value %TLD_SUCCESS if all code
 *   points are valid or when @tld is %NULL, otherwise the code
 *   reported for the first rejected code point (%TLD_INVALID).
 */
int
tld_check_4t (const uint32_t * in, size_t inlen, size_t * errpos,
              const Tld_table * tld)
{
  size_t pos;

  /* No data for TLD so everything is valid. */
  if (tld == NULL)
    return TLD_SUCCESS;

  for (pos = 0; pos < inlen; pos++)
    {
      int status = _tld_checkchar (in[pos], tld);

      if (status == TLD_SUCCESS)
        continue;

      if (errpos != NULL)
        *errpos = pos;
      return status;
    }

  return TLD_SUCCESS;
}
/**
 * tld_check_4tz - verify that characters are permitted
 * @in: Zero terminated array of unicode code points to process.
 * @errpos: Position of offending character is returned here.
 * @tld: A #Tld_table data structure representing the restrictions for
 *   which the input should be tested.
 *
 * Zero terminated variant of tld_check_4t(): the number of code
 * points in @in is counted up to, but not including, the terminating
 * zero and the check itself is delegated to tld_check_4t().
 *
 * Return value: Returns %TLD_NODATA if @in is %NULL, otherwise the
 *   value returned by tld_check_4t(): %TLD_SUCCESS if all code points
 *   are valid or when @tld is %NULL, %TLD_INVALID if a character is
 *   not allowed.
 */
int
tld_check_4tz (const uint32_t * in, size_t * errpos, const Tld_table * tld)
{
  size_t len;

  if (in == NULL)
    return TLD_NODATA;

  for (len = 0; in[len] != 0; len++)
    continue;

  return tld_check_4t (in, len, errpos, tld);
}
/**
 * tld_check_4 - verify that characters are permitted
 * @in: Array of unicode code points to process. Does not need to be
 *   zero terminated.
 * @inlen: Number of unicode code points.
 * @errpos: Position of offending character is returned here.  May be
 *   %NULL; otherwise it is reset to zero before anything is checked.
 * @overrides: A #Tld_table array of additional domain restriction
 *   structures that complement and supersede the built-in information.
 *
 * Determine the top-level domain of @in with tld_get_4(), fetch the
 * matching table with tld_default_table() and test every code point
 * of @in against it.  When data for the same TLD is available both
 * internally and in @overrides, the information in @overrides takes
 * precedence.  If several entries for a specific TLD are found, the
 * first one is used.  If @overrides is %NULL, only the built-in
 * information is used.  The position of the first offending character
 * is returned in @errpos.
 *
 * Return value: Returns the #Tld_rc value %TLD_SUCCESS if all code
 *   points are valid, if @in has no top-level domain, or if no table
 *   is known for its top-level domain; %TLD_INVALID if a character is
 *   not allowed; or additional error codes on general failure
 *   conditions.
 */
int
tld_check_4 (const uint32_t * in, size_t inlen, size_t * errpos,
             const Tld_table ** overrides)
{
  const Tld_table *table;
  char *tldname;
  int status;

  if (errpos != NULL)
    *errpos = 0;

  /* Get TLD name.  Input without a TLD is not an error. */
  status = tld_get_4 (in, inlen, &tldname);
  if (status == TLD_NO_TLD)
    return TLD_SUCCESS;
  if (status != TLD_SUCCESS)
    return status;

  /* The name is only needed for the lookup, release it right away. */
  table = tld_default_table (tldname, overrides);
  free (tldname);

  return tld_check_4t (in, inlen, errpos, table);
}
/**
 * tld_check_4z - verify that characters are permitted
 * @in: Zero-terminated array of unicode code points to process.
 * @errpos: Position of offending character is returned here.
 * @overrides: A #Tld_table array of additional domain restriction
 *   structures that complement and supersede the built-in information.
 *
 * Zero terminated variant of tld_check_4(): the number of code points
 * in @in is counted up to, but not including, the terminating zero
 * and everything else is delegated to tld_check_4().  See that
 * function for how @overrides and the built-in tables are combined.
 *
 * Return value: Returns %TLD_NODATA if @in is %NULL, otherwise the
 *   value returned by tld_check_4(): %TLD_SUCCESS if all code points
 *   are valid, %TLD_INVALID if a character is not allowed, or
 *   additional error codes on general failure conditions.
 */
int
tld_check_4z (const uint32_t * in, size_t * errpos,
              const Tld_table ** overrides)
{
  size_t len;

  if (in == NULL)
    return TLD_NODATA;

  for (len = 0; in[len] != 0; len++)
    continue;

  return tld_check_4 (in, len, errpos, overrides);
}
/**
 * tld_check_8z - verify that characters are permitted
 * @in: Zero-terminated UTF8 string to process.
 * @errpos: Position of offending character is returned here.
 * @overrides: A #Tld_table array of additional domain restriction
 *   structures that complement and supersede the built-in information.
 *
 * Convert @in to an array of code points with
 * stringprep_utf8_to_ucs4() and check the result with tld_check_4().
 * See that function for how @overrides and the built-in tables are
 * combined.  Note that the error position refers to the decoded
 * character offset rather than the byte position in the string.
 *
 * Return value: Returns %TLD_NODATA if @in is %NULL,
 *   %TLD_MALLOC_ERROR if the conversion yields no result, otherwise
 *   the value returned by tld_check_4(): %TLD_SUCCESS if all
 *   characters are valid, %TLD_INVALID if a character is not allowed,
 *   or additional error codes on general failure conditions.
 */
int
tld_check_8z (const char *in, size_t * errpos, const Tld_table ** overrides)
{
  uint32_t *ucs4;
  size_t ncodepoints;
  int status;

  if (in == NULL)
    return TLD_NODATA;

  ucs4 = stringprep_utf8_to_ucs4 (in, -1, &ncodepoints);
  if (ucs4 == NULL)
    return TLD_MALLOC_ERROR;

  /* The decoded copy is only needed for the duration of the check. */
  status = tld_check_4 (ucs4, ncodepoints, errpos, overrides);
  free (ucs4);

  return status;
}
/**
 * tld_check_lz - verify that characters are permitted
 * @in: Zero-terminated string in the current locales encoding to process.
 * @errpos: Position of offending character is returned here.
 * @overrides: A #Tld_table array of additional domain restriction
 *   structures that complement and supersede the built-in information.
 *
 * Convert @in to UTF-8 with stringprep_locale_to_utf8() and check the
 * result with tld_check_8z().  See tld_check_4() for how @overrides
 * and the built-in tables are combined.  Note that the error position
 * refers to the decoded character offset rather than the byte
 * position in the string.
 *
 * Return value: Returns %TLD_NODATA if @in is %NULL,
 *   %TLD_ICONV_ERROR if the conversion to UTF-8 fails, otherwise the
 *   value returned by tld_check_8z(): %TLD_SUCCESS if all characters
 *   are valid, %TLD_INVALID if a character is not allowed, or
 *   additional error codes on general failure conditions.
 */
int
tld_check_lz (const char *in, size_t * errpos, const Tld_table ** overrides)
{
  char *converted;
  int status;

  if (in == NULL)
    return TLD_NODATA;

  converted = stringprep_locale_to_utf8 (in);
  if (converted == NULL)
    return TLD_ICONV_ERROR;

  /* The UTF-8 copy is only needed for the duration of the check. */
  status = tld_check_8z (converted, errpos, overrides);
  free (converted);

  return status;
}
/**
* Tld_rc:
* @TLD_SUCCESS: Successful operation. This value is guaranteed to
* always be zero, the remaining ones are only guaranteed to hold
* non-zero values, for logical comparison purposes.
* @TLD_INVALID: Invalid character found.
* @TLD_NODATA: No input data was provided.
* @TLD_MALLOC_ERROR: Error during memory allocation.
* @TLD_ICONV_ERROR: Error during iconv string conversion.
* @TLD_NO_TLD: No top-level domain found in domain string.
*
* Enumerated return codes of the TLD checking functions.
* The value 0 is guaranteed to always correspond to success.
*/
/* syntax highlighted by Code2HTML, v. 0.9.1 */