From: Carl Worth Date: Mon, 19 Oct 2009 19:57:38 +0000 (-0700) Subject: date.c: Add new file directly from gmime2.4-2.4.6/gmime/gmime-utils.c X-Git-Tag: 0.1~840 X-Git-Url: https://git.notmuchmail.org/git?p=notmuch;a=commitdiff_plain;h=f5f8dcf2a00e3d78f862734bafeae0a7b25738c1 date.c: Add new file directly from gmime2.4-2.4.6/gmime/gmime-utils.c We're sucking in one gmime implementation file just to get the piece that parses an RFC 822 date, because I don't want to go through the pain of replicating that. --- diff --git a/date.c b/date.c new file mode 100644 index 00000000..6505e616 --- /dev/null +++ b/date.c @@ -0,0 +1,2509 @@ +/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ +/* GMime + * Copyright (C) 2000-2009 Jeffrey Stedfast + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free + * Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ */ + + +#ifdef HAVE_CONFIG_H +#include +#endif + +#define _GNU_SOURCE + +#include + +#include +#include +#include +#ifdef HAVE_SYS_PARAM_H +#include /* for MAXHOSTNAMELEN */ +#else +#define MAXHOSTNAMELEN 64 +#endif +#ifdef HAVE_UTSNAME_DOMAINNAME +#include /* for uname() */ +#endif +#include +#ifdef HAVE_UNISTD_H +#include /* Unix header for getpid() */ +#endif +#ifdef G_OS_WIN32 +#include +#include +#include +#define getpid() _getpid() +#endif +#ifdef HAVE_NETDB_H +#include +#endif +#include +#include + +#include "gmime-utils.h" +#include "gmime-table-private.h" +#include "gmime-parse-utils.h" +#include "gmime-part.h" +#include "gmime-charset.h" +#include "gmime-iconv.h" +#include "gmime-iconv-utils.h" + +#ifdef ENABLE_WARNINGS +#define w(x) x +#else +#define w(x) +#endif /* ENABLE_WARNINGS */ + +#define d(x) + + +/** + * SECTION: gmime-utils + * @title: gmime-utils + * @short_description: MIME utility functions + * @see_also: + * + * Utility functions to parse, encode and decode various MIME tokens + * and encodings. 
+ **/ + +extern gboolean _g_mime_enable_rfc2047_workarounds (void); + +#define GMIME_FOLD_PREENCODED (GMIME_FOLD_LEN / 2) + +/* date parser macros */ +#define NUMERIC_CHARS "1234567890" +#define WEEKDAY_CHARS "SundayMondayTuesdayWednesdayThursdayFridaySaturday" +#define MONTH_CHARS "JanuaryFebruaryMarchAprilMayJuneJulyAugustSeptemberOctoberNovemberDecember" +#define TIMEZONE_ALPHA_CHARS "UTCGMTESTEDTCSTCDTMSTPSTPDTZAMNY()" +#define TIMEZONE_NUMERIC_CHARS "-+1234567890" +#define TIME_CHARS "1234567890:" + +#define DATE_TOKEN_NON_NUMERIC (1 << 0) +#define DATE_TOKEN_NON_WEEKDAY (1 << 1) +#define DATE_TOKEN_NON_MONTH (1 << 2) +#define DATE_TOKEN_NON_TIME (1 << 3) +#define DATE_TOKEN_HAS_COLON (1 << 4) +#define DATE_TOKEN_NON_TIMEZONE_ALPHA (1 << 5) +#define DATE_TOKEN_NON_TIMEZONE_NUMERIC (1 << 6) +#define DATE_TOKEN_HAS_SIGN (1 << 7) + +static unsigned char tohex[16] = { + '0', '1', '2', '3', '4', '5', '6', '7', + '8', '9', 'A', 'B', 'C', 'D', 'E', 'F' +}; + +static unsigned char gmime_datetok_table[256] = { + 128,111,111,111,111,111,111,111,111,111,111,111,111,111,111,111, + 111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,111, + 111,111,111,111,111,111,111,111, 79, 79,111,175,111,175,111,111, + 38, 38, 38, 38, 38, 38, 38, 38, 38, 38,119,111,111,111,111,111, + 111, 75,111, 79, 75, 79,105, 79,111,111,107,111,111, 73, 75,107, + 79,111,111, 73, 77, 79,111,109,111, 79, 79,111,111,111,111,111, + 111,105,107,107,109,105,111,107,105,105,111,111,107,107,105,105, + 107,111,105,105,105,105,107,111,111,105,111,111,111,111,111,111, + 111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,111, + 111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,111, + 111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,111, + 111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,111, + 111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,111, + 111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,111, + 
111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,111, + 111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,111, +}; + +/* hrm, is there a library for this shit? */ +static struct { + char *name; + int offset; +} tz_offsets [] = { + { "UT", 0 }, + { "GMT", 0 }, + { "EST", -500 }, /* these are all US timezones. bloody yanks */ + { "EDT", -400 }, + { "CST", -600 }, + { "CDT", -500 }, + { "MST", -700 }, + { "MDT", -600 }, + { "PST", -800 }, + { "PDT", -700 }, + { "Z", 0 }, + { "A", -100 }, + { "M", -1200 }, + { "N", 100 }, + { "Y", 1200 }, +}; + +static char *tm_months[] = { + "Jan", "Feb", "Mar", "Apr", "May", "Jun", + "Jul", "Aug", "Sep", "Oct", "Nov", "Dec" +}; + +static char *tm_days[] = { + "Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat" +}; + + +/** + * g_mime_utils_header_format_date: + * @date: time_t date representation + * @tz_offset: Timezone offset + * + * Allocates a string buffer containing the rfc822 formatted date + * string represented by @time and @tz_offset. + * + * Returns: a valid string representation of the date. + **/ +char * +g_mime_utils_header_format_date (time_t date, int tz_offset) +{ + struct tm tm; + + date += ((tz_offset / 100) * (60 * 60)) + (tz_offset % 100) * 60; + +#if defined (HAVE_GMTIME_R) + gmtime_r (&date, &tm); +#elif defined (HAVE_GMTIME_S) + gmtime_s (&tm, &date); +#else + memcpy (&tm, gmtime (&date), sizeof (tm)); +#endif + + return g_strdup_printf ("%s, %02d %s %04d %02d:%02d:%02d %+05d", + tm_days[tm.tm_wday], tm.tm_mday, + tm_months[tm.tm_mon], + tm.tm_year + 1900, + tm.tm_hour, tm.tm_min, tm.tm_sec, + tz_offset); +} + +/* This is where it gets ugly... 
*/ + +typedef struct _date_token { + struct _date_token *next; + unsigned char mask; + const char *start; + size_t len; +} date_token; + +#define date_token_free(tok) g_slice_free (date_token, tok) +#define date_token_new() g_slice_new (date_token) + +static date_token * +datetok (const char *date) +{ + date_token *tokens = NULL, *token, *tail = (date_token *) &tokens; + const char *start, *end; + unsigned char mask; + + start = date; + while (*start) { + /* kill leading whitespace */ + while (*start == ' ' || *start == '\t') + start++; + + if (*start == '\0') + break; + + mask = gmime_datetok_table[(unsigned char) *start]; + + /* find the end of this token */ + end = start + 1; + while (*end && !strchr ("-/,\t\r\n ", *end)) + mask |= gmime_datetok_table[(unsigned char) *end++]; + + if (end != start) { + token = date_token_new (); + token->next = NULL; + token->start = start; + token->len = end - start; + token->mask = mask; + + tail->next = token; + tail = token; + } + + if (*end) + start = end + 1; + else + break; + } + + return tokens; +} + +static int +decode_int (const char *in, size_t inlen) +{ + register const char *inptr; + int sign = 1, val = 0; + const char *inend; + + inptr = in; + inend = in + inlen; + + if (*inptr == '-') { + sign = -1; + inptr++; + } else if (*inptr == '+') + inptr++; + + for ( ; inptr < inend; inptr++) { + if (!(*inptr >= '0' && *inptr <= '9')) + return -1; + else + val = (val * 10) + (*inptr - '0'); + } + + val *= sign; + + return val; +} + +#if 0 +static int +get_days_in_month (int month, int year) +{ + switch (month) { + case 1: + case 3: + case 5: + case 7: + case 8: + case 10: + case 12: + return 31; + case 4: + case 6: + case 9: + case 11: + return 30; + case 2: + if (g_date_is_leap_year (year)) + return 29; + else + return 28; + default: + return 0; + } +} +#endif + +static int +get_wday (const char *in, size_t inlen) +{ + int wday; + + g_return_val_if_fail (in != NULL, -1); + + if (inlen < 3) + return -1; + + for (wday = 0; 
wday < 7; wday++) { + if (!g_ascii_strncasecmp (in, tm_days[wday], 3)) + return wday; + } + + return -1; /* unknown week day */ +} + +static int +get_mday (const char *in, size_t inlen) +{ + int mday; + + g_return_val_if_fail (in != NULL, -1); + + mday = decode_int (in, inlen); + + if (mday < 0 || mday > 31) + mday = -1; + + return mday; +} + +static int +get_month (const char *in, size_t inlen) +{ + int i; + + g_return_val_if_fail (in != NULL, -1); + + if (inlen < 3) + return -1; + + for (i = 0; i < 12; i++) { + if (!g_ascii_strncasecmp (in, tm_months[i], 3)) + return i; + } + + return -1; /* unknown month */ +} + +static int +get_year (const char *in, size_t inlen) +{ + int year; + + g_return_val_if_fail (in != NULL, -1); + + if ((year = decode_int (in, inlen)) == -1) + return -1; + + if (year < 100) + year += (year < 70) ? 2000 : 1900; + + if (year < 1969) + return -1; + + return year; +} + +static gboolean +get_time (const char *in, size_t inlen, int *hour, int *min, int *sec) +{ + register const char *inptr; + int *val, colons = 0; + const char *inend; + + *hour = *min = *sec = 0; + + inend = in + inlen; + val = hour; + for (inptr = in; inptr < inend; inptr++) { + if (*inptr == ':') { + colons++; + switch (colons) { + case 1: + val = min; + break; + case 2: + val = sec; + break; + default: + return FALSE; + } + } else if (!(*inptr >= '0' && *inptr <= '9')) + return FALSE; + else + *val = (*val * 10) + (*inptr - '0'); + } + + return TRUE; +} + +static int +get_tzone (date_token **token) +{ + const char *inptr, *inend; + size_t inlen; + int i, t; + + for (i = 0; *token && i < 2; *token = (*token)->next, i++) { + inptr = (*token)->start; + inlen = (*token)->len; + inend = inptr + inlen; + + if (*inptr == '+' || *inptr == '-') { + return decode_int (inptr, inlen); + } else { + if (*inptr == '(') { + inptr++; + if (*(inend - 1) == ')') + inlen -= 2; + else + inlen--; + } + + for (t = 0; t < 15; t++) { + size_t len = strlen (tz_offsets[t].name); + + if (len != 
inlen) + continue; + + if (!strncmp (inptr, tz_offsets[t].name, len)) + return tz_offsets[t].offset; + } + } + } + + return -1; +} + +static time_t +mktime_utc (struct tm *tm) +{ + time_t tt; + long tz; + + tm->tm_isdst = -1; + tt = mktime (tm); + +#if defined (G_OS_WIN32) + _get_timezone (&tz); + if (tm->tm_isdst > 0) { + int dst; + + _get_dstbias (&dst); + tz += dst; + } +#elif defined (HAVE_TM_GMTOFF) + tz = -tm->tm_gmtoff; +#elif defined (HAVE_TIMEZONE) + if (tm->tm_isdst > 0) { +#if defined (HAVE_ALTZONE) + tz = altzone; +#else /* !defined (HAVE_ALTZONE) */ + tz = (timezone - 3600); +#endif + } else { + tz = timezone; + } +#elif defined (HAVE__TIMEZONE) + tz = _timezone; +#else +#error Neither HAVE_TIMEZONE nor HAVE_TM_GMTOFF defined. Rerun autoheader, autoconf, etc. +#endif + + return tt - tz; +} + +static time_t +parse_rfc822_date (date_token *tokens, int *tzone) +{ + int hour, min, sec, offset, n; + date_token *token; + struct tm tm; + time_t t; + + g_return_val_if_fail (tokens != NULL, (time_t) 0); + + token = tokens; + + memset ((void *) &tm, 0, sizeof (struct tm)); + + if ((n = get_wday (token->start, token->len)) != -1) { + /* not all dates may have this... 
*/ + tm.tm_wday = n; + token = token->next; + } + + /* get the mday */ + if (!token || (n = get_mday (token->start, token->len)) == -1) + return (time_t) 0; + + tm.tm_mday = n; + token = token->next; + + /* get the month */ + if (!token || (n = get_month (token->start, token->len)) == -1) + return (time_t) 0; + + tm.tm_mon = n; + token = token->next; + + /* get the year */ + if (!token || (n = get_year (token->start, token->len)) == -1) + return (time_t) 0; + + tm.tm_year = n - 1900; + token = token->next; + + /* get the hour/min/sec */ + if (!token || !get_time (token->start, token->len, &hour, &min, &sec)) + return (time_t) 0; + + tm.tm_hour = hour; + tm.tm_min = min; + tm.tm_sec = sec; + token = token->next; + + /* get the timezone */ + if (!token || (n = get_tzone (&token)) == -1) { + /* I guess we assume tz is GMT? */ + offset = 0; + } else { + offset = n; + } + + t = mktime_utc (&tm); + + /* t is now GMT of the time we want, but not offset by the timezone ... */ + + /* this should convert the time to the GMT equiv time */ + t -= ((offset / 100) * 60 * 60) + (offset % 100) * 60; + + if (tzone) + *tzone = offset; + + return t; +} + + +#define date_token_mask(t) (((date_token *) t)->mask) +#define is_numeric(t) ((date_token_mask (t) & DATE_TOKEN_NON_NUMERIC) == 0) +#define is_weekday(t) ((date_token_mask (t) & DATE_TOKEN_NON_WEEKDAY) == 0) +#define is_month(t) ((date_token_mask (t) & DATE_TOKEN_NON_MONTH) == 0) +#define is_time(t) (((date_token_mask (t) & DATE_TOKEN_NON_TIME) == 0) && (date_token_mask (t) & DATE_TOKEN_HAS_COLON)) +#define is_tzone_alpha(t) ((date_token_mask (t) & DATE_TOKEN_NON_TIMEZONE_ALPHA) == 0) +#define is_tzone_numeric(t) (((date_token_mask (t) & DATE_TOKEN_NON_TIMEZONE_NUMERIC) == 0) && (date_token_mask (t) & DATE_TOKEN_HAS_SIGN)) +#define is_tzone(t) (is_tzone_alpha (t) || is_tzone_numeric (t)) + +static time_t +parse_broken_date (date_token *tokens, int *tzone) +{ + gboolean got_wday, got_month, got_tzone; + int hour, min, sec, offset, 
n; + date_token *token; + struct tm tm; + time_t t; + + memset ((void *) &tm, 0, sizeof (struct tm)); + got_wday = got_month = got_tzone = FALSE; + offset = 0; + + token = tokens; + while (token) { + if (is_weekday (token) && !got_wday) { + if ((n = get_wday (token->start, token->len)) != -1) { + d(printf ("weekday; ")); + got_wday = TRUE; + tm.tm_wday = n; + goto next; + } + } + + if (is_month (token) && !got_month) { + if ((n = get_month (token->start, token->len)) != -1) { + d(printf ("month; ")); + got_month = TRUE; + tm.tm_mon = n; + goto next; + } + } + + if (is_time (token) && !tm.tm_hour && !tm.tm_min && !tm.tm_sec) { + if (get_time (token->start, token->len, &hour, &min, &sec)) { + d(printf ("time; ")); + tm.tm_hour = hour; + tm.tm_min = min; + tm.tm_sec = sec; + goto next; + } + } + + if (is_tzone (token) && !got_tzone) { + date_token *t = token; + + if ((n = get_tzone (&t)) != -1) { + d(printf ("tzone; ")); + got_tzone = TRUE; + offset = n; + goto next; + } + } + + if (is_numeric (token)) { + if (token->len == 4 && !tm.tm_year) { + if ((n = get_year (token->start, token->len)) != -1) { + d(printf ("year; ")); + tm.tm_year = n - 1900; + goto next; + } + } else { + /* Note: assumes MM-DD-YY ordering if '0 < MM < 12' holds true */ + if (!got_month && token->next && is_numeric (token->next)) { + if ((n = decode_int (token->start, token->len)) > 12) { + goto mday; + } else if (n > 0) { + d(printf ("mon; ")); + got_month = TRUE; + tm.tm_mon = n - 1; + } + goto next; + } else if (!tm.tm_mday && (n = get_mday (token->start, token->len)) != -1) { + mday: + d(printf ("mday; ")); + tm.tm_mday = n; + goto next; + } else if (!tm.tm_year) { + if ((n = get_year (token->start, token->len)) != -1) { + d(printf ("2-digit year; ")); + tm.tm_year = n - 1900; + } + goto next; + } + } + } + + d(printf ("???; ")); + + next: + + token = token->next; + } + + d(printf ("\n")); + + t = mktime_utc (&tm); + + /* t is now GMT of the time we want, but not offset by the timezone ... 
*/ + + /* this should convert the time to the GMT equiv time */ + t -= ((offset / 100) * 60 * 60) + (offset % 100) * 60; + + if (tzone) + *tzone = offset; + + return t; +} + +#if 0 +static void +gmime_datetok_table_init (void) +{ + int i; + + memset (gmime_datetok_table, 0, sizeof (gmime_datetok_table)); + + for (i = 0; i < 256; i++) { + if (!strchr (NUMERIC_CHARS, i)) + gmime_datetok_table[i] |= DATE_TOKEN_NON_NUMERIC; + + if (!strchr (WEEKDAY_CHARS, i)) + gmime_datetok_table[i] |= DATE_TOKEN_NON_WEEKDAY; + + if (!strchr (MONTH_CHARS, i)) + gmime_datetok_table[i] |= DATE_TOKEN_NON_MONTH; + + if (!strchr (TIME_CHARS, i)) + gmime_datetok_table[i] |= DATE_TOKEN_NON_TIME; + + if (!strchr (TIMEZONE_ALPHA_CHARS, i)) + gmime_datetok_table[i] |= DATE_TOKEN_NON_TIMEZONE_ALPHA; + + if (!strchr (TIMEZONE_NUMERIC_CHARS, i)) + gmime_datetok_table[i] |= DATE_TOKEN_NON_TIMEZONE_NUMERIC; + + if (((char) i) == ':') + gmime_datetok_table[i] |= DATE_TOKEN_HAS_COLON; + + if (strchr ("+-", i)) + gmime_datetok_table[i] |= DATE_TOKEN_HAS_SIGN; + } + + printf ("static unsigned char gmime_datetok_table[256] = {"); + for (i = 0; i < 256; i++) { + if (i % 16 == 0) + printf ("\n\t"); + printf ("%3d,", gmime_datetok_table[i]); + } + printf ("\n};\n"); +} +#endif + + +/** + * g_mime_utils_header_decode_date: + * @str: input date string + * @tz_offset: timezone offset + * + * Decodes the rfc822 date string and saves the GMT offset into + * @tz_offset if non-NULL. + * + * Returns: the time_t representation of the date string specified by + * @str or (time_t) %0 on error. If @tz_offset is non-NULL, the value + * of the timezone offset will be stored. 
+ **/ +time_t +g_mime_utils_header_decode_date (const char *str, int *tz_offset) +{ + date_token *token, *tokens; + time_t date; + + if (!(tokens = datetok (str))) { + if (tz_offset) + *tz_offset = 0; + + return (time_t) 0; + } + + if (!(date = parse_rfc822_date (tokens, tz_offset))) + date = parse_broken_date (tokens, tz_offset); + + /* cleanup */ + while (tokens) { + token = tokens; + tokens = tokens->next; + date_token_free (token); + } + + return date; +} + + +/** + * g_mime_utils_generate_message_id: + * @fqdn: Fully qualified domain name + * + * Generates a unique Message-Id. + * + * Returns: a unique string in an addr-spec format suitable for use as + * a Message-Id. + **/ +char * +g_mime_utils_generate_message_id (const char *fqdn) +{ +#ifdef G_THREADS_ENABLED + static GStaticMutex mutex = G_STATIC_MUTEX_INIT; +#define MUTEX_LOCK() g_static_mutex_lock (&mutex) +#define MUTEX_UNLOCK() g_static_mutex_unlock (&mutex) +#else +#define MUTEX_LOCK() +#define MUTEX_UNLOCK() +#endif + static unsigned long int count = 0; + const char *hostname = NULL; + char *name = NULL; + char *msgid; + + if (!fqdn) { +#ifdef HAVE_UTSNAME_DOMAINNAME + struct utsname unam; + + uname (&unam); + + hostname = unam.nodename; + + if (unam.domainname[0]) + name = g_strdup_printf ("%s.%s", hostname, unam.domainname); +#else /* ! 
HAVE_UTSNAME_DOMAINNAME */ + char host[MAXHOSTNAMELEN + 1]; + +#ifdef HAVE_GETHOSTNAME + host[MAXHOSTNAMELEN] = '\0'; + if (gethostname (host, MAXHOSTNAMELEN) == 0) { +#ifdef HAVE_GETDOMAINNAME + size_t domainlen = MAXHOSTNAMELEN; + char *domain; + int rv; + + domain = g_malloc (domainlen); + + while ((rv = getdomainname (domain, domainlen)) == -1 && errno == EINVAL) { + domainlen += MAXHOSTNAMELEN; + domain = g_realloc (domain, domainlen); + } + + if (rv == 0 && domain[0]) { + if (host[0]) { + name = g_strdup_printf ("%s.%s", host, domain); + g_free (domain); + } else { + name = domain; + } + } +#endif /* HAVE_GETDOMAINNAME */ + } else { + host[0] = '\0'; + } +#endif /* HAVE_GETHOSTNAME */ + hostname = host; +#endif /* HAVE_UTSNAME_DOMAINNAME */ + +#ifdef HAVE_GETADDRINFO + if (!name && hostname[0]) { + /* we weren't able to get a domain name */ + struct addrinfo hints, *res; + + memset (&hints, 0, sizeof (hints)); + hints.ai_flags = AI_CANONNAME; + + if (getaddrinfo (hostname, NULL, &hints, &res) == 0) { + name = g_strdup (res->ai_canonname); + freeaddrinfo (res); + } + } +#endif /* HAVE_GETADDRINFO */ + + fqdn = name != NULL ? name : (hostname[0] ? 
hostname : "localhost.localdomain"); + } + + MUTEX_LOCK (); + msgid = g_strdup_printf ("%lu.%lu.%lu@%s", (unsigned long int) time (NULL), + (unsigned long int) getpid (), count++, fqdn); + MUTEX_UNLOCK (); + + g_free (name); + + return msgid; +} + +static char * +decode_addrspec (const char **in) +{ + const char *word, *inptr; + GString *addrspec; + char *str; + + decode_lwsp (in); + inptr = *in; + + if (!(word = decode_word (&inptr))) { + w(g_warning ("No local-part in addr-spec: %s", *in)); + return NULL; + } + + addrspec = g_string_new (""); + g_string_append_len (addrspec, word, (size_t) (inptr - word)); + + /* get the rest of the local-part */ + decode_lwsp (&inptr); + while (*inptr == '.') { + g_string_append_c (addrspec, *inptr++); + if ((word = decode_word (&inptr))) { + g_string_append_len (addrspec, word, (size_t) (inptr - word)); + decode_lwsp (&inptr); + } else { + w(g_warning ("Invalid local-part in addr-spec: %s", *in)); + goto exception; + } + } + + /* we should be at the '@' now... 
*/ + if (*inptr++ != '@') { + w(g_warning ("Invalid addr-spec; missing '@': %s", *in)); + goto exception; + } + + g_string_append_c (addrspec, '@'); + if (!decode_domain (&inptr, addrspec)) { + w(g_warning ("No domain in addr-spec: %s", *in)); + goto exception; + } + + str = addrspec->str; + g_string_free (addrspec, FALSE); + + *in = inptr; + + return str; + + exception: + + g_string_free (addrspec, TRUE); + + return NULL; +} + +static char * +decode_msgid (const char **in) +{ + const char *inptr = *in; + char *msgid = NULL; + + decode_lwsp (&inptr); + if (*inptr != '<') { + w(g_warning ("Invalid msg-id; missing '<': %s", *in)); + } else { + inptr++; + } + + decode_lwsp (&inptr); + if ((msgid = decode_addrspec (&inptr))) { + decode_lwsp (&inptr); + if (*inptr != '>') { + w(g_warning ("Invalid msg-id; missing '>': %s", *in)); + } else { + inptr++; + } + + *in = inptr; + } else { + w(g_warning ("Invalid msg-id; missing addr-spec: %s", *in)); + *in = inptr; + while (*inptr && *inptr != '>') + inptr++; + + msgid = g_strndup (*in, (size_t) (inptr - *in)); + *in = inptr; + } + + return msgid; +} + + +/** + * g_mime_utils_decode_message_id: + * @message_id: string containing a message-id + * + * Decodes a msg-id as defined by rfc822. + * + * Returns: the addr-spec portion of the msg-id. + **/ +char * +g_mime_utils_decode_message_id (const char *message_id) +{ + g_return_val_if_fail (message_id != NULL, NULL); + + return decode_msgid (&message_id); +} + + +/** + * g_mime_references_decode: + * @text: string containing a list of msg-ids + * + * Decodes a list of msg-ids as in the References and/or In-Reply-To + * headers defined in rfc822. + * + * Returns: a list of referenced msg-ids. 
+ **/ +GMimeReferences * +g_mime_references_decode (const char *text) +{ + GMimeReferences *refs, *tail, *ref; + const char *word, *inptr = text; + char *msgid; + + g_return_val_if_fail (text != NULL, NULL); + + refs = NULL; + tail = (GMimeReferences *) &refs; + + while (*inptr) { + decode_lwsp (&inptr); + if (*inptr == '<') { + /* looks like a msg-id */ + if ((msgid = decode_msgid (&inptr))) { + ref = g_new (GMimeReferences, 1); + ref->next = NULL; + ref->msgid = msgid; + tail->next = ref; + tail = ref; + } else { + w(g_warning ("Invalid References header: %s", inptr)); + break; + } + } else if (*inptr) { + /* looks like part of a phrase */ + if (!(word = decode_word (&inptr))) { + w(g_warning ("Invalid References header: %s", inptr)); + break; + } + } + } + + return refs; +} + + +/** + * g_mime_references_append: + * @refs: the address of a #GMimeReferences list + * @msgid: a message-id string + * + * Appends a reference to msgid to the list of references. + **/ +void +g_mime_references_append (GMimeReferences **refs, const char *msgid) +{ + GMimeReferences *ref; + + g_return_if_fail (refs != NULL); + g_return_if_fail (msgid != NULL); + + ref = (GMimeReferences *) refs; + while (ref->next) + ref = ref->next; + + ref->next = g_new (GMimeReferences, 1); + ref->next->msgid = g_strdup (msgid); + ref->next->next = NULL; +} + + +/** + * g_mime_references_free: + * @refs: a #GMimeReferences list + * + * Frees the #GMimeReferences list. + **/ +void +g_mime_references_free (GMimeReferences *refs) +{ + GMimeReferences *ref, *next; + + ref = refs; + while (ref) { + next = ref->next; + g_free (ref->msgid); + g_free (ref); + ref = next; + } +} + + +/** + * g_mime_references_clear: + * @refs: address of a #GMimeReferences list + * + * Clears the #GMimeReferences list and resets it to %NULL. 
+ **/ +void +g_mime_references_clear (GMimeReferences **refs) +{ + g_return_if_fail (refs != NULL); + + g_mime_references_free (*refs); + *refs = NULL; +} + + +/** + * g_mime_references_get_next: + * @ref: a #GMimeReferences list + * + * Advances to the next reference node in the #GMimeReferences list. + * + * Returns: the next reference node in the #GMimeReferences list. + **/ +const GMimeReferences * +g_mime_references_get_next (const GMimeReferences *ref) +{ + return ref ? ref->next : NULL; +} + + +/** + * g_mime_references_get_message_id: + * @ref: a #GMimeReferences list + * + * Gets the Message-Id reference from the #GMimeReferences node. + * + * Returns: the Message-Id reference from the #GMimeReferences node. + **/ +const char * +g_mime_references_get_message_id (const GMimeReferences *ref) +{ + return ref ? ref->msgid : NULL; +} + + +static gboolean +is_rfc2047_token (const char *inptr, size_t len) +{ + if (len < 8 || strncmp (inptr, "=?", 2) != 0 || strncmp (inptr + len - 2, "?=", 2) != 0) + return FALSE; + + inptr += 2; + len -= 2; + + /* skip past the charset */ + while (*inptr != '?' && len > 0) { + inptr++; + len--; + } + + if (*inptr != '?' 
|| len < 4) + return FALSE; + + if (inptr[1] != 'q' && inptr[1] != 'Q' && inptr[1] != 'b' && inptr[1] != 'B') + return FALSE; + + inptr += 2; + len -= 2; + + if (*inptr != '?') + return FALSE; + + return TRUE; +} + +static char * +header_fold (const char *in, gboolean structured) +{ + gboolean last_was_lwsp = FALSE; + register const char *inptr; + size_t len, outlen, i; + size_t fieldlen; + GString *out; + char *ret; + + inptr = in; + len = strlen (in); + if (len <= GMIME_FOLD_LEN + 1) + return g_strdup (in); + + out = g_string_new (""); + fieldlen = strcspn (inptr, ": \t\n"); + g_string_append_len (out, inptr, fieldlen); + outlen = fieldlen; + inptr += fieldlen; + + while (*inptr && *inptr != '\n') { + len = strcspn (inptr, " \t\n"); + + if (len > 1 && outlen + len > GMIME_FOLD_LEN) { + if (outlen > 1 && out->len > fieldlen + 2) { + if (last_was_lwsp) { + if (structured) + out->str[out->len - 1] = '\t'; + + g_string_insert_c (out, out->len - 1, '\n'); + } else + g_string_append (out, "\n\t"); + outlen = 1; + } + + if (!structured && !is_rfc2047_token (inptr, len)) { + /* check for very long words, just cut them up */ + while (outlen + len > GMIME_FOLD_LEN) { + for (i = 0; i < GMIME_FOLD_LEN - outlen; i++) + g_string_append_c (out, inptr[i]); + inptr += GMIME_FOLD_LEN - outlen; + len -= GMIME_FOLD_LEN - outlen; + g_string_append (out, "\n\t"); + outlen = 1; + } + } else { + g_string_append_len (out, inptr, len); + outlen += len; + inptr += len; + } + last_was_lwsp = FALSE; + } else if (len > 0) { + g_string_append_len (out, inptr, len); + outlen += len; + inptr += len; + last_was_lwsp = FALSE; + } else { + last_was_lwsp = TRUE; + if (*inptr == '\t') { + /* tabs are a good place to fold, odds + are that this is where the previous + mailer folded it */ + g_string_append (out, "\n\t"); + outlen = 1; + while (is_blank (*inptr)) + inptr++; + } else { + g_string_append_c (out, *inptr++); + outlen++; + } + } + } + + if (*inptr == '\n' && out->str[out->len - 1] != '\n') + 
g_string_append_c (out, '\n'); + + ret = out->str; + g_string_free (out, FALSE); + + return ret; +} + + +/** + * g_mime_utils_structured_header_fold: + * @str: input string + * + * Folds a structured header according to the rules in rfc822. + * + * Returns: an allocated string containing the folded header. + **/ +char * +g_mime_utils_structured_header_fold (const char *str) +{ + return header_fold (str, TRUE); +} + + +/** + * g_mime_utils_unstructured_header_fold: + * @str: input string + * + * Folds an unstructured header according to the rules in rfc822. + * + * Returns: an allocated string containing the folded header. + **/ +char * +g_mime_utils_unstructured_header_fold (const char *str) +{ + return header_fold (str, FALSE); +} + + +/** + * g_mime_utils_header_fold: + * @str: input string + * + * Folds a structured header according to the rules in rfc822. + * + * Returns: an allocated string containing the folded header. + **/ +char * +g_mime_utils_header_fold (const char *str) +{ + return header_fold (str, TRUE); +} + + +/** + * g_mime_utils_header_printf: + * @format: string format + * @Varargs: arguments + * + * Allocates a buffer containing a formatted header specified by the + * @Varargs. + * + * Returns: an allocated string containing the folded header specified + * by @format and the following arguments. + **/ +char * +g_mime_utils_header_printf (const char *format, ...) 
+{ + char *buf, *ret; + va_list ap; + + va_start (ap, format); + buf = g_strdup_vprintf (format, ap); + va_end (ap); + + ret = header_fold (buf, TRUE); + g_free (buf); + + return ret; +} + +static gboolean +need_quotes (const char *string) +{ + gboolean quoted = FALSE; + const char *inptr; + + inptr = string; + + while (*inptr) { + if (*inptr == '\\') + inptr++; + else if (*inptr == '"') + quoted = !quoted; + else if (!quoted && (is_tspecial (*inptr) || *inptr == '.')) + return TRUE; + + if (*inptr) + inptr++; + } + + return FALSE; +} + +/** + * g_mime_utils_quote_string: + * @str: input string + * + * Quotes @string as needed according to the rules in rfc2045. + * + * Returns: an allocated string containing the escaped and quoted (if + * needed to be) input string. The decision to quote the string is + * based on whether or not the input string contains any 'tspecials' + * as defined by rfc2045. + **/ +char * +g_mime_utils_quote_string (const char *str) +{ + gboolean quote; + const char *c; + char *qstring; + GString *out; + + out = g_string_new (""); + + if ((quote = need_quotes (str))) + g_string_append_c (out, '"'); + + for (c = str; *c; c++) { + if ((*c == '"' && quote) || *c == '\\') + g_string_append_c (out, '\\'); + + g_string_append_c (out, *c); + } + + if (quote) + g_string_append_c (out, '"'); + + qstring = out->str; + g_string_free (out, FALSE); + + return qstring; +} + + +/** + * g_mime_utils_unquote_string: + * @str: input string + * + * Unquotes and unescapes a string. 
+ **/ +void +g_mime_utils_unquote_string (char *str) +{ + /* if the string is quoted, unquote it */ + register char *inptr = str; + int escaped = FALSE; + int quoted = FALSE; + + if (!str) + return; + + while (*inptr) { + if (*inptr == '\\') { + if (escaped) + *str++ = *inptr++; + else + inptr++; + escaped = !escaped; + } else if (*inptr == '"') { + if (escaped) { + *str++ = *inptr++; + escaped = FALSE; + } else { + quoted = !quoted; + inptr++; + } + } else { + *str++ = *inptr++; + escaped = FALSE; + } + } + + *str = '\0'; +} + + +/** + * g_mime_utils_text_is_8bit: + * @text: text to check for 8bit chars + * @len: text length + * + * Determines if @text contains 8bit characters within the first @len + * bytes. + * + * Returns: %TRUE if the text contains 8bit characters or %FALSE + * otherwise. + **/ +gboolean +g_mime_utils_text_is_8bit (const unsigned char *text, size_t len) +{ + register const unsigned char *inptr; + const unsigned char *inend; + + g_return_val_if_fail (text != NULL, FALSE); + + inend = text + len; + for (inptr = text; *inptr && inptr < inend; inptr++) + if (*inptr > (unsigned char) 127) + return TRUE; + + return FALSE; +} + + +/** + * g_mime_utils_best_encoding: + * @text: text to encode + * @len: text length + * + * Determines the best content encoding for the first @len bytes of + * @text. + * + * Returns: a #GMimeContentEncoding that is determined to be the best + * encoding type for the specified block of text. 
("best" in this
 * particular case means smallest output size)
 **/
GMimeContentEncoding
g_mime_utils_best_encoding (const unsigned char *text, size_t len)
{
	size_t high_bytes = 0;
	size_t i;

	/* count the bytes that have the top bit set */
	for (i = 0; i < len; i++) {
		if (text[i] > (unsigned char) 127)
			high_bytes++;
	}

	/* at most 17% 8bit bytes selects quoted-printable, more selects base64 */
	return ((float) high_bytes <= len * 0.17)
		? GMIME_CONTENT_ENCODING_QUOTEDPRINTABLE
		: GMIME_CONTENT_ENCODING_BASE64;
}


/**
 * charset_convert:
 * @cd: iconv converter
 * @inbuf: input text buffer to convert
 * @inleft: length of the input buffer
 * @outp: pointer to output buffer
 * @outlenp: pointer to output buffer length
 * @ninval: the number of invalid bytes in @inbuf
 *
 * Converts the input buffer from one charset to another using the
 * @cd. On completion, @outp will point to the output buffer
 * containing the converted text (nul-terminated), @outlenp will be
 * the size of the @outp buffer (note: not the strlen() of @outp) and
 * @ninval will contain the number of bytes which could not be
 * converted.
 *
 * Bytes which cannot be converted from @inbuf will appear as '?'
 * characters in the output buffer.
 *
 * If *@outp is non-NULL, then it is assumed that it points to a
 * pre-allocated buffer of length *@outlenp (plus one byte for the
 * terminating nul). This is done so that the same output buffer can
 * be reused multiple times; the buffer may be reallocated.
 *
 * Returns: the number of bytes written to the output buffer,
 * including the terminating nul byte.
+ **/ +static size_t +charset_convert (iconv_t cd, const char *inbuf, size_t inleft, char **outp, size_t *outlenp, size_t *ninval) +{ + size_t outlen, outleft, rc, n = 0; + char *outbuf, *out; + + if (*outp == NULL) { + outleft = outlen = (inleft * 2) + 16; + outbuf = out = g_malloc (outlen + 1); + } else { + outleft = outlen = *outlenp; + outbuf = out = *outp; + } + + do { + rc = iconv (cd, (char **) &inbuf, &inleft, &outbuf, &outleft); + if (rc == (size_t) -1) { + if (errno == EINVAL) { + /* incomplete sequence at the end of the input buffer */ + n += inleft; + break; + } + +#ifdef G_OS_WIN32 + /* seems that GnuWin32's libiconv 1.9 does not set errno in + * the E2BIG case, so we have to fake it */ + if (outleft <= inleft) + errno = E2BIG; +#endif + + if (errno == E2BIG) { + /* need to grow the output buffer */ + outlen += (inleft * 2) + 16; + rc = (size_t) (outbuf - out); + out = g_realloc (out, outlen + 1); + outleft = outlen - rc; + outbuf = out + rc; + } else { + /* invalid byte(-sequence) in the input buffer */ + *outbuf++ = '?'; + outleft--; + inleft--; + inbuf++; + n++; + } + } + } while (inleft > 0); + + iconv (cd, NULL, NULL, &outbuf, &outleft); + *outbuf++ = '\0'; + + *outlenp = outlen; + *outp = out; + *ninval = n; + + return (outbuf - out); +} + + +#define USER_CHARSETS_INCLUDE_UTF8 (1 << 0) +#define USER_CHARSETS_INCLUDE_LOCALE (1 << 1) + + +/** + * g_mime_utils_decode_8bit: + * @text: input text in unknown 8bit/multibyte character set + * @len: input text length + * + * Attempts to convert text in an unknown 8bit/multibyte charset into + * UTF-8 by finding the charset which will convert the most bytes into + * valid UTF-8 characters as possible. If no exact match can be found, + * it will choose the best match and convert invalid byte sequences + * into question-marks (?) in the returned string buffer. + * + * Returns: a UTF-8 string representation of @text. 
+ **/ +char * +g_mime_utils_decode_8bit (const char *text, size_t len) +{ + const char **charsets, **user_charsets, *locale, *best; + size_t outleft, outlen, min, ninval; + unsigned int included = 0; + iconv_t cd; + char *out; + int i = 0; + + g_return_val_if_fail (text != NULL, NULL); + + locale = g_mime_locale_charset (); + if (locale && !g_ascii_strcasecmp (locale, "UTF-8")) + included |= USER_CHARSETS_INCLUDE_LOCALE; + + if ((user_charsets = g_mime_user_charsets ())) { + while (user_charsets[i]) + i++; + } + + charsets = g_alloca (sizeof (char *) * (i + 3)); + i = 0; + + if (user_charsets) { + while (user_charsets[i]) { + /* keep a record of whether or not the user-supplied + * charsets include UTF-8 and/or the default fallback + * charset so that we avoid doubling our efforts for + * these 2 charsets. We could have used a hash table + * to keep track of unique charsets, but we can + * (hopefully) assume that user_charsets is a unique + * list of charsets with no duplicates. */ + if (!g_ascii_strcasecmp (user_charsets[i], "UTF-8")) + included |= USER_CHARSETS_INCLUDE_UTF8; + + if (locale && !g_ascii_strcasecmp (user_charsets[i], locale)) + included |= USER_CHARSETS_INCLUDE_LOCALE; + + charsets[i] = user_charsets[i]; + i++; + } + } + + if (!(included & USER_CHARSETS_INCLUDE_UTF8)) + charsets[i++] = "UTF-8"; + + if (!(included & USER_CHARSETS_INCLUDE_LOCALE)) + charsets[i++] = locale; + + charsets[i] = NULL; + + min = len; + best = charsets[0]; + + outleft = (len * 2) + 16; + out = g_malloc (outleft + 1); + + for (i = 0; charsets[i]; i++) { + if ((cd = g_mime_iconv_open ("UTF-8", charsets[i])) == (iconv_t) -1) + continue; + + outlen = charset_convert (cd, text, len, &out, &outleft, &ninval); + + g_mime_iconv_close (cd); + + if (ninval == 0) + return g_realloc (out, outlen + 1); + + if (ninval < min) { + best = charsets[i]; + min = ninval; + } + } + + /* if we get here, then none of the charsets fit the 8bit text flawlessly... 
+ * try to find the one that fit the best and use that to convert what we can, + * replacing any byte we can't convert with a '?' */ + + if ((cd = g_mime_iconv_open ("UTF-8", best)) == (iconv_t) -1) { + /* this shouldn't happen... but if we are here, then + * it did... the only thing we can do at this point + * is replace the 8bit garbage and pray */ + register const char *inptr = text; + const char *inend = inptr + len; + char *outbuf = out; + + while (inptr < inend) { + if (is_ascii (*inptr)) + *outbuf++ = *inptr++; + else + *outbuf++ = '?'; + } + + *outbuf++ = '\0'; + + return g_realloc (out, (size_t) (outbuf - out)); + } + + outlen = charset_convert (cd, text, len, &out, &outleft, &ninval); + + g_mime_iconv_close (cd); + + return g_realloc (out, outlen + 1); +} + + +/* this decodes rfc2047's version of quoted-printable */ +static ssize_t +quoted_decode (const unsigned char *in, size_t len, unsigned char *out) +{ + register const unsigned char *inptr; + register unsigned char *outptr; + const unsigned char *inend; + unsigned char c, c1; + + inend = in + len; + outptr = out; + + inptr = in; + while (inptr < inend) { + c = *inptr++; + if (c == '=') { + if (inend - inptr >= 2) { + c = toupper (*inptr++); + c1 = toupper (*inptr++); + *outptr++ = (((c >= 'A' ? c - 'A' + 10 : c - '0') & 0x0f) << 4) + | ((c1 >= 'A' ? 
c1 - 'A' + 10 : c1 - '0') & 0x0f); + } else { + /* data was truncated */ + return -1; + } + } else if (c == '_') { + /* _'s are an rfc2047 shortcut for encoding spaces */ + *outptr++ = ' '; + } else { + *outptr++ = c; + } + } + + return (ssize_t) (outptr - out); +} + +#define is_rfc2047_encoded_word(atom, len) (len >= 7 && !strncmp (atom, "=?", 2) && !strncmp (atom + len - 2, "?=", 2)) + +static char * +rfc2047_decode_word (const char *in, size_t inlen) +{ + const unsigned char *instart = (const unsigned char *) in; + const register unsigned char *inptr = instart + 2; + const unsigned char *inend = instart + inlen - 2; + unsigned char *decoded; + const char *charset; + size_t len, ninval; + char *charenc, *p; + guint32 save = 0; + ssize_t declen; + int state = 0; + iconv_t cd; + char *buf; + + /* skip over the charset */ + if (!(inptr = memchr (inptr, '?', inend - inptr)) || inptr[2] != '?') + return NULL; + + inptr++; + + switch (*inptr) { + case 'B': + case 'b': + inptr += 2; + len = (size_t) (inend - inptr); + decoded = g_alloca (len); + declen = g_mime_encoding_base64_decode_step (inptr, len, decoded, &state, &save); + + if (declen == -1) { + d(fprintf (stderr, "encountered broken 'Q' encoding\n")); + return NULL; + } + break; + case 'Q': + case 'q': + inptr += 2; + len = (size_t) (inend - inptr); + decoded = g_alloca (len); + declen = quoted_decode (inptr, len, decoded); + + if (declen == -1) { + d(fprintf (stderr, "encountered broken 'Q' encoding\n")); + return NULL; + } + break; + default: + d(fprintf (stderr, "unknown encoding\n")); + return NULL; + } + + len = (inptr - 3) - (instart + 2); + charenc = g_alloca (len + 1); + memcpy (charenc, in + 2, len); + charenc[len] = '\0'; + charset = charenc; + + /* rfc2231 updates rfc2047 encoded words... + * The ABNF given in RFC 2047 for encoded-words is: + * encoded-word := "=?" charset "?" encoding "?" encoded-text "?=" + * This specification changes this ABNF to: + * encoded-word := "=?" charset ["*" language] "?" 
encoding "?" encoded-text "?=" + */ + + /* trim off the 'language' part if it's there... */ + if ((p = strchr (charset, '*'))) + *p = '\0'; + + /* slight optimization? */ + if (!g_ascii_strcasecmp (charset, "UTF-8")) { + p = (char *) decoded; + len = declen; + + //while (!g_utf8_validate (p, len, (const char **) &p)) { + // len = declen - (p - (char *) decoded); + // *p = '?'; + //} + + return g_strndup ((char *) decoded, declen); + } + + if (!charset[0] || (cd = g_mime_iconv_open ("UTF-8", charset)) == (iconv_t) -1) { + w(g_warning ("Cannot convert from %s to UTF-8, header display may " + "be corrupt: %s", charset[0] ? charset : "unspecified charset", + g_strerror (errno))); + + return g_mime_utils_decode_8bit ((char *) decoded, declen); + } + + len = declen; + buf = g_malloc (len + 1); + + charset_convert (cd, (char *) decoded, declen, &buf, &len, &ninval); + + g_mime_iconv_close (cd); + +#if w(!)0 + if (ninval > 0) { + g_warning ("Failed to completely convert \"%.*s\" to UTF-8, display may be " + "corrupt: %s", declen, decoded, g_strerror (errno)); + } +#endif + + return buf; +} + + +/** + * g_mime_utils_header_decode_text: + * @text: header text to decode + * + * Decodes an rfc2047 encoded 'text' header. + * + * Note: See g_mime_set_user_charsets() for details on how charset + * conversion is handled for unencoded 8bit text and/or wrongly + * specified rfc2047 encoded-word tokens. + * + * Returns: a newly allocated UTF-8 string representing the the decoded + * header. 
+ **/ +char * +g_mime_utils_header_decode_text (const char *text) +{ + gboolean enable_rfc2047_workarounds = _g_mime_enable_rfc2047_workarounds (); + register const char *inptr = text; + gboolean encoded = FALSE; + const char *lwsp, *word; + size_t nlwsp, n; + gboolean ascii; + char *decoded; + GString *out; + + if (text == NULL) + return g_strdup (""); + + out = g_string_sized_new (strlen (text) + 1); + + while (*inptr != '\0') { + lwsp = inptr; + while (is_lwsp (*inptr)) + inptr++; + + nlwsp = (size_t) (inptr - lwsp); + + if (*inptr != '\0') { + word = inptr; + ascii = TRUE; + + if (enable_rfc2047_workarounds) { + if (!strncmp (inptr, "=?", 2)) { + inptr += 2; + + /* skip past the charset (if one is even declared, sigh) */ + while (*inptr && *inptr != '?') { + ascii = ascii && is_ascii (*inptr); + inptr++; + } + + /* sanity check encoding type */ + if (inptr[0] != '?' || !strchr ("BbQq", inptr[1]) || inptr[2] != '?') + goto non_rfc2047; + + inptr += 3; + + /* find the end of the rfc2047 encoded word token */ + while (*inptr && strncmp (inptr, "?=", 2) != 0) { + ascii = ascii && is_ascii (*inptr); + inptr++; + } + + if (!strncmp (inptr, "?=", 2)) + inptr += 2; + } else { + non_rfc2047: + /* stop if we encounter a possible rfc2047 encoded + * token even if it's inside another word, sigh. 
*/ + while (*inptr && !is_lwsp (*inptr) && + strncmp (inptr, "=?", 2) != 0) { + ascii = ascii && is_ascii (*inptr); + inptr++; + } + } + } else { + while (*inptr && !is_lwsp (*inptr)) { + ascii = ascii && is_ascii (*inptr); + inptr++; + } + } + + n = (size_t) (inptr - word); + if (is_rfc2047_encoded_word (word, n)) { + if ((decoded = rfc2047_decode_word (word, n))) { + /* rfc2047 states that you must ignore all + * whitespace between encoded words */ + if (!encoded) + g_string_append_len (out, lwsp, nlwsp); + + g_string_append (out, decoded); + g_free (decoded); + + encoded = TRUE; + } else { + /* append lwsp and invalid rfc2047 encoded-word token */ + g_string_append_len (out, lwsp, nlwsp + n); + encoded = FALSE; + } + } else { + /* append lwsp */ + g_string_append_len (out, lwsp, nlwsp); + + /* append word token */ + if (!ascii) { + /* *sigh* I hate broken mailers... */ + decoded = g_mime_utils_decode_8bit (word, n); + g_string_append (out, decoded); + g_free (decoded); + } else { + g_string_append_len (out, word, n); + } + + encoded = FALSE; + } + } else { + /* appending trailing lwsp */ + g_string_append_len (out, lwsp, nlwsp); + break; + } + } + + decoded = out->str; + g_string_free (out, FALSE); + + return decoded; +} + + +/** + * g_mime_utils_header_decode_phrase: + * @phrase: header to decode + * + * Decodes an rfc2047 encoded 'phrase' header. + * + * Note: See g_mime_set_user_charsets() for details on how charset + * conversion is handled for unencoded 8bit text and/or wrongly + * specified rfc2047 encoded-word tokens. + * + * Returns: a newly allocated UTF-8 string representing the the decoded + * header. 
+ **/ +char * +g_mime_utils_header_decode_phrase (const char *phrase) +{ + register const char *inptr = phrase; + gboolean encoded = FALSE; + const char *lwsp, *text; + size_t nlwsp, n; + gboolean ascii; + char *decoded; + GString *out; + + if (phrase == NULL) + return g_strdup (""); + + out = g_string_sized_new (strlen (phrase) + 1); + + while (*inptr != '\0') { + lwsp = inptr; + while (is_lwsp (*inptr)) + inptr++; + + nlwsp = (size_t) (inptr - lwsp); + + text = inptr; + if (is_atom (*inptr)) { + while (is_atom (*inptr)) + inptr++; + + n = (size_t) (inptr - text); + if (is_rfc2047_encoded_word (text, n)) { + if ((decoded = rfc2047_decode_word (text, n))) { + /* rfc2047 states that you must ignore all + * whitespace between encoded words */ + if (!encoded) + g_string_append_len (out, lwsp, nlwsp); + + g_string_append (out, decoded); + g_free (decoded); + + encoded = TRUE; + } else { + /* append lwsp and invalid rfc2047 encoded-word token */ + g_string_append_len (out, lwsp, nlwsp + n); + encoded = FALSE; + } + } else { + /* append lwsp and atom token */ + g_string_append_len (out, lwsp, nlwsp + n); + encoded = FALSE; + } + } else { + g_string_append_len (out, lwsp, nlwsp); + + ascii = TRUE; + while (*inptr && !is_lwsp (*inptr)) { + ascii = ascii && is_ascii (*inptr); + inptr++; + } + + n = (size_t) (inptr - text); + + if (!ascii) { + /* *sigh* I hate broken mailers... 
*/ + decoded = g_mime_utils_decode_8bit (text, n); + g_string_append (out, decoded); + g_free (decoded); + } else { + g_string_append_len (out, text, n); + } + + encoded = FALSE; + } + } + + decoded = out->str; + g_string_free (out, FALSE); + + return decoded; +} + + +/* rfc2047 version of quoted-printable */ +static size_t +quoted_encode (const char *in, size_t len, unsigned char *out, gushort safemask) +{ + register const unsigned char *inptr = (const unsigned char *) in; + const unsigned char *inend = inptr + len; + register unsigned char *outptr = out; + unsigned char c; + + while (inptr < inend) { + c = *inptr++; + if (c == ' ') { + *outptr++ = '_'; + } else if (c != '_' && gmime_special_table[c] & safemask) { + *outptr++ = c; + } else { + *outptr++ = '='; + *outptr++ = tohex[(c >> 4) & 0xf]; + *outptr++ = tohex[c & 0xf]; + } + } + + return (outptr - out); +} + +static void +rfc2047_encode_word (GString *string, const char *word, size_t len, + const char *charset, gushort safemask) +{ + register char *inptr, *outptr; + iconv_t cd = (iconv_t) -1; + unsigned char *encoded; + size_t enclen, pos; + char *uword = NULL; + guint32 save = 0; + int state = 0; + char encoding; + + if (g_ascii_strcasecmp (charset, "UTF-8") != 0) + cd = g_mime_iconv_open (charset, "UTF-8"); + + if (cd != (iconv_t) -1) { + uword = g_mime_iconv_strndup (cd, (char *) word, len); + g_mime_iconv_close (cd); + } + + if (uword) { + len = strlen (uword); + word = uword; + } else { + charset = "UTF-8"; + } + + switch (g_mime_utils_best_encoding ((const unsigned char *) word, len)) { + case GMIME_CONTENT_ENCODING_BASE64: + enclen = GMIME_BASE64_ENCODE_LEN (len); + encoded = g_alloca (enclen + 1); + + encoding = 'b'; + + pos = g_mime_encoding_base64_encode_close ((const unsigned char *) word, len, encoded, &state, &save); + encoded[pos] = '\0'; + + /* remove \n chars as headers need to be wrapped differently */ + if (G_UNLIKELY ((inptr = strchr ((char *) encoded, '\n')))) { + outptr = inptr++; + 
while (G_LIKELY (*inptr)) { + if (G_LIKELY (*inptr != '\n')) + *outptr++ = *inptr; + + inptr++; + } + + *outptr = '\0'; + } + + break; + case GMIME_CONTENT_ENCODING_QUOTEDPRINTABLE: + enclen = GMIME_QP_ENCODE_LEN (len); + encoded = g_alloca (enclen + 1); + + encoding = 'q'; + + pos = quoted_encode (word, len, encoded, safemask); + encoded[pos] = '\0'; + + break; + default: + encoded = NULL; + encoding = '\0'; + g_assert_not_reached (); + } + + g_free (uword); + + g_string_append_printf (string, "=?%s?%c?%s?=", charset, encoding, encoded); +} + + +typedef enum { + WORD_ATOM, + WORD_QSTRING, + WORD_2047 +} rfc822_word_t; + +typedef struct _rfc822_word { + struct _rfc822_word *next; + const char *start, *end; + rfc822_word_t type; + int encoding; +} rfc822_word; + +#define rfc822_word_free(word) g_slice_free (rfc822_word, word) +#define rfc822_word_new() g_slice_new (rfc822_word) + +/* okay, so 'unstructured text' fields don't actually contain 'word' + * tokens, but we can group stuff similarly... 
*/ +static rfc822_word * +rfc2047_encode_get_rfc822_words (const char *in, gboolean phrase) +{ + rfc822_word *words, *tail, *word; + rfc822_word_t type = WORD_ATOM; + const char *inptr, *start, *last; + int count = 0, encoding = 0; + + words = NULL; + tail = (rfc822_word *) &words; + + last = start = inptr = in; + while (inptr && *inptr) { + const char *newinptr; + gunichar c; + + newinptr = g_utf8_next_char (inptr); + c = g_utf8_get_char (inptr); + if (newinptr == NULL || !g_unichar_validate (c)) { + w(g_warning ("Invalid UTF-8 sequence encountered")); + inptr++; + continue; + } + + inptr = newinptr; + + if (c < 256 && is_lwsp (c)) { + if (count > 0) { + word = rfc822_word_new (); + word->next = NULL; + word->start = start; + word->end = last; + word->type = type; + word->encoding = encoding; + + tail->next = word; + tail = word; + count = 0; + } + + start = inptr; + type = WORD_ATOM; + encoding = 0; + } else { + count++; + if (phrase && c < 128) { + /* phrases can have qstring words */ + if (!is_atom (c)) + type = MAX (type, WORD_QSTRING); + } else if (c > 127 && c < 256) { + type = WORD_2047; + encoding = MAX (encoding, 1); + } else if (c >= 256) { + type = WORD_2047; + encoding = 2; + } + + if (count >= GMIME_FOLD_PREENCODED) { + word = rfc822_word_new (); + word->next = NULL; + word->start = start; + word->end = inptr; + word->type = type; + word->encoding = encoding; + + tail->next = word; + tail = word; + count = 0; + + /* Note: don't reset 'type' as it + * needs to be preserved when breaking + * long words */ + start = inptr; + encoding = 0; + } + } + + last = inptr; + } + + if (count > 0) { + word = rfc822_word_new (); + word->next = NULL; + word->start = start; + word->end = last; + word->type = type; + word->encoding = encoding; + + tail->next = word; + tail = word; + } + +#if d(!)0 + printf ("rfc822 word tokens:\n"); + word = words; + while (word) { + printf ("\t'%.*s'; type=%d, encoding=%d\n", + word->end - word->start, word->start, + word->type, 
word->encoding); + + word = word->next; + } +#endif + + return words; +} + +#define MERGED_WORD_LT_FOLDLEN(wlen, type) ((type) == WORD_2047 ? (wlen) < GMIME_FOLD_PREENCODED : (wlen) < (GMIME_FOLD_LEN - 8)) + +static gboolean +should_merge_words (rfc822_word *word, rfc822_word *next) +{ + switch (word->type) { + case WORD_ATOM: + if (next->type == WORD_2047) + return FALSE; + + return (MERGED_WORD_LT_FOLDLEN (next->end - word->start, next->type)); + case WORD_QSTRING: + /* avoid merging with words that need to be rfc2047 encoded */ + if (next->type == WORD_2047) + return FALSE; + + return (MERGED_WORD_LT_FOLDLEN (next->end - word->start, WORD_QSTRING)); + case WORD_2047: + if (next->type == WORD_ATOM) { + /* whether we merge or not is dependent upon: + * 1. the number of atoms in a row after 'word' + * 2. if there is another encword after the string of atoms. + */ + int natoms = 0; + + while (next && next->type == WORD_ATOM) { + next = next->next; + natoms++; + } + + /* if all the words after the encword are atoms, don't merge */ + if (!next || natoms > 3) + return FALSE; + } + + /* avoid merging with qstrings */ + if (next->type == WORD_QSTRING) + return FALSE; + + return (MERGED_WORD_LT_FOLDLEN (next->end - word->start, WORD_2047)); + default: + return FALSE; + } +} + +static void +rfc2047_encode_merge_rfc822_words (rfc822_word **wordsp) +{ + rfc822_word *word, *next, *words = *wordsp; + + /* first pass: merge qstrings with adjacent qstrings and encwords with adjacent encwords */ + word = words; + while (word && word->next) { + next = word->next; + + if (word->type != WORD_ATOM && word->type == next->type && + MERGED_WORD_LT_FOLDLEN (next->end - word->start, word->type)) { + /* merge the words */ + word->encoding = MAX (word->encoding, next->encoding); + + word->end = next->end; + word->next = next->next; + + rfc822_word_free (next); + + next = word; + } + + word = next; + } + + /* second pass: now merge atoms with the other words */ + word = words; + while (word 
&& word->next) { + next = word->next; + + if (should_merge_words (word, next)) { + /* the resulting word type is the MAX of the 2 types */ + word->type = MAX (word->type, next->type); + + word->encoding = MAX (word->encoding, next->encoding); + + word->end = next->end; + word->next = next->next; + + rfc822_word_free (next); + + continue; + } + + word = next; + } + + *wordsp = words; +} + +static void +g_string_append_len_quoted (GString *out, const char *in, size_t len) +{ + register const char *inptr; + const char *inend; + + g_string_append_c (out, '"'); + + inptr = in; + inend = in + len; + + while (inptr < inend) { + if (*inptr == '"' || *inptr == '\\') + g_string_append_c (out, '\\'); + + g_string_append_c (out, *inptr); + + inptr++; + } + + g_string_append_c (out, '"'); +} + +static char * +rfc2047_encode (const char *in, gushort safemask) +{ + rfc822_word *words, *word, *prev = NULL; + const char **charsets, *charset; + const char *start; + GMimeCharset mask; + GString *out; + char *outstr; + size_t len; + int i; + + if (!(words = rfc2047_encode_get_rfc822_words (in, safemask & IS_PSAFE))) + return g_strdup (in); + + rfc2047_encode_merge_rfc822_words (&words); + + charsets = g_mime_user_charsets (); + + out = g_string_new (""); + + /* output words now with spaces between them */ + word = words; + while (word) { + /* append correct number of spaces between words */ + if (prev && !(prev->type == WORD_2047 && word->type == WORD_2047)) { + /* one or both of the words are not encoded so we write the spaces out untouched */ + len = word->start - prev->end; + g_string_append_len (out, prev->end, len); + } + + switch (word->type) { + case WORD_ATOM: + g_string_append_len (out, word->start, (size_t) (word->end - word->start)); + break; + case WORD_QSTRING: + g_assert (safemask & IS_PSAFE); + g_string_append_len_quoted (out, word->start, (size_t) (word->end - word->start)); + break; + case WORD_2047: + if (prev && prev->type == WORD_2047) { + /* include the whitespace 
chars between these 2 words in the + resulting rfc2047 encoded word. */ + len = word->end - prev->end; + start = prev->end; + + /* encoded words need to be separated by linear whitespace */ + g_string_append_c (out, ' '); + } else { + len = word->end - word->start; + start = word->start; + } + + switch (word->encoding) { + case 0: /* us-ascii */ + rfc2047_encode_word (out, start, len, "us-ascii", safemask); + break; + case 1: /* iso-8859-1 */ + rfc2047_encode_word (out, start, len, "iso-8859-1", safemask); + break; + default: + charset = NULL; + g_mime_charset_init (&mask); + g_mime_charset_step (&mask, start, len); + + for (i = 0; charsets && charsets[i]; i++) { + if (g_mime_charset_can_encode (&mask, charsets[i], start, len)) { + charset = charsets[i]; + break; + } + } + + if (!charset) + charset = g_mime_charset_best_name (&mask); + + rfc2047_encode_word (out, start, len, charset, safemask); + break; + } + + break; + } + + rfc822_word_free (prev); + + prev = word; + word = word->next; + } + + rfc822_word_free (prev); + + outstr = out->str; + g_string_free (out, FALSE); + + return outstr; +} + + +/** + * g_mime_utils_header_encode_phrase: + * @phrase: phrase to encode + * + * Encodes a 'phrase' header according to the rules in rfc2047. + * + * Returns: the encoded 'phrase'. Useful for encoding internet + * addresses. + **/ +char * +g_mime_utils_header_encode_phrase (const char *phrase) +{ + if (phrase == NULL) + return NULL; + + return rfc2047_encode (phrase, IS_PSAFE); +} + + +/** + * g_mime_utils_header_encode_text: + * @text: text to encode + * + * Encodes a 'text' header according to the rules in rfc2047. + * + * Returns: the encoded header. Useful for encoding + * headers like "Subject". + **/ +char * +g_mime_utils_header_encode_text (const char *text) +{ + if (text == NULL) + return NULL; + + return rfc2047_encode (text, IS_ESAFE); +}