git.notmuchmail.org Git - notmuch/blob - date.c

   1 /* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
   2 /*  GMime
   3  *  Copyright (C) 2000-2009 Jeffrey Stedfast
   4  *
   5  *  This library is free software; you can redistribute it and/or
   6  *  modify it under the terms of the GNU Lesser General Public License
   7  *  as published by the Free Software Foundation; either version 2.1
   8  *  of the License, or (at your option) any later version.
   9  *
  10  *  This library is distributed in the hope that it will be useful,
  11  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13  *  Lesser General Public License for more details.
  14  *
  15  *  You should have received a copy of the GNU Lesser General Public
  16  *  License along with this library; if not, write to the Free
  17  *  Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
  18  *  02110-1301, USA.
  19  */
  20
  21
  22 #ifdef HAVE_CONFIG_H
  23 #include <config.h>
  24 #endif
  25
  26 #define _GNU_SOURCE
  27
  28 #include <glib.h>
  29
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
  33 #ifdef HAVE_SYS_PARAM_H
  34 #include <sys/param.h>      /* for MAXHOSTNAMELEN */
  35 #else
  36 #define MAXHOSTNAMELEN 64
  37 #endif
  38 #ifdef HAVE_UTSNAME_DOMAINNAME
  39 #include <sys/utsname.h>    /* for uname() */
  40 #endif
  41 #include <sys/types.h>
  42 #ifdef HAVE_UNISTD_H
  43 #include <unistd.h>         /* Unix header for getpid() */
  44 #endif
  45 #ifdef G_OS_WIN32
  46 #include <winsock2.h>
  47 #include <ws2tcpip.h>
  48 #include <process.h>
  49 #define getpid() _getpid()
  50 #endif
  51 #ifdef HAVE_NETDB_H
  52 #include <netdb.h>
  53 #endif
  54 #include <ctype.h>
  55 #include <errno.h>
  56
  57 #include "gmime-utils.h"
  58 #include "gmime-table-private.h"
  59 #include "gmime-parse-utils.h"
  60 #include "gmime-part.h"
  61 #include "gmime-charset.h"
  62 #include "gmime-iconv.h"
  63 #include "gmime-iconv-utils.h"
  64
/* w(x): expands to its argument only when ENABLE_WARNINGS is defined, so
 * statements wrapped in w() (the g_warning() calls below) vanish from
 * builds without that define. */
#ifdef ENABLE_WARNINGS
#define w(x) x
#else
#define w(x)
#endif /* ENABLE_WARNINGS */

/* d(x): wrapper for debug statements; defined empty here, so every d(...)
 * in this file compiles to nothing. */
#define d(x)
  72
  73
  74 /**
  75  * SECTION: gmime-utils
  76  * @title: gmime-utils
  77  * @short_description: MIME utility functions
  78  * @see_also:
  79  *
  80  * Utility functions to parse, encode and decode various MIME tokens
  81  * and encodings.
  82  **/
  83
/* Declared here, defined in another translation unit. */
extern gboolean _g_mime_enable_rfc2047_workarounds (void);

/* Half of GMIME_FOLD_LEN (GMIME_FOLD_LEN is not defined in this part of the file). */
#define GMIME_FOLD_PREENCODED  (GMIME_FOLD_LEN / 2)

/* date parser macros */

/* Character sets, one per token class.  They are only consumed by the
 * disabled gmime_datetok_table_init() generator: a character that does NOT
 * occur in a set gets the matching DATE_TOKEN_NON_* bit in
 * gmime_datetok_table. */
#define NUMERIC_CHARS          "1234567890"
#define WEEKDAY_CHARS          "SundayMondayTuesdayWednesdayThursdayFridaySaturday"
#define MONTH_CHARS            "JanuaryFebruaryMarchAprilMayJuneJulyAugustSeptemberOctoberNovemberDecember"
#define TIMEZONE_ALPHA_CHARS   "UTCGMTESTEDTCSTCDTMSTPSTPDTZAMNY()"
#define TIMEZONE_NUMERIC_CHARS "-+1234567890"
#define TIME_CHARS             "1234567890:"

/* Per-character flag bits.  datetok() ORs the bits of every character of a
 * token into date_token.mask; a NON_* bit that stays clear means that all
 * characters of the token belong to that class.  HAS_COLON / HAS_SIGN are
 * positive markers for ':' and '+'/'-'. */
#define DATE_TOKEN_NON_NUMERIC          (1 << 0)
#define DATE_TOKEN_NON_WEEKDAY          (1 << 1)
#define DATE_TOKEN_NON_MONTH            (1 << 2)
#define DATE_TOKEN_NON_TIME             (1 << 3)
#define DATE_TOKEN_HAS_COLON            (1 << 4)
#define DATE_TOKEN_NON_TIMEZONE_ALPHA   (1 << 5)
#define DATE_TOKEN_NON_TIMEZONE_NUMERIC (1 << 6)
#define DATE_TOKEN_HAS_SIGN             (1 << 7)
 104
/* Upper-case hexadecimal digit character for each value 0..15. */
static unsigned char tohex[16] = {
        '0', '1', '2', '3', '4', '5', '6', '7',
        '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'
};
 109
/* DATE_TOKEN_* flag bits for every possible byte value, indexed by the
 * character cast to unsigned char.  The layout (16 values per row) matches
 * what the disabled gmime_datetok_table_init() prints.  111 is the value of
 * all five NON_* bits except NON_TIMEZONE... combined with no positive
 * marker, i.e. "belongs to no token class"; digits are 38, ':' is 119 and
 * '+' / '-' are 175. */
static unsigned char gmime_datetok_table[256] = {
        128,111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,
        111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,
        111,111,111,111,111,111,111,111, 79, 79,111,175,111,175,111,111,
         38, 38, 38, 38, 38, 38, 38, 38, 38, 38,119,111,111,111,111,111,
        111, 75,111, 79, 75, 79,105, 79,111,111,107,111,111, 73, 75,107,
         79,111,111, 73, 77, 79,111,109,111, 79, 79,111,111,111,111,111,
        111,105,107,107,109,105,111,107,105,105,111,111,107,107,105,105,
        107,111,105,105,105,105,107,111,111,105,111,111,111,111,111,111,
        111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,
        111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,
        111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,
        111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,
        111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,
        111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,
        111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,
        111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,
};
 128
/* Named timezones recognised by get_tzone().  The offset uses the same
 * signed decimal +/-HHMM form as a numeric zone (e.g. -500 is five hours
 * behind GMT): the parsers split it with offset / 100 and offset % 100. */
static struct {
        char *name;
        int offset;
} tz_offsets [] = {
        { "UT", 0 },
        { "GMT", 0 },
        { "EST", -500 },        /* the three-letter zones below are all US timezones */
        { "EDT", -400 },
        { "CST", -600 },
        { "CDT", -500 },
        { "MST", -700 },
        { "MDT", -600 },
        { "PST", -800 },
        { "PDT", -700 },
        { "Z", 0 },             /* single-letter zone names */
        { "A", -100 },
        { "M", -1200 },
        { "N", 100 },
        { "Y", 1200 },
};
 150
/* Three-letter month names, indexed by struct tm's tm_mon (0 = Jan).
 * Used for formatting and, case-insensitively, by get_month(). */
static char *tm_months[] = {
        "Jan", "Feb", "Mar", "Apr", "May", "Jun",
        "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"
};
 155
/* Three-letter weekday names, indexed by struct tm's tm_wday (0 = Sun).
 * Used for formatting and, case-insensitively, by get_wday(). */
static char *tm_days[] = {
        "Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"
};
 159
 160
 161 /**
 162  * g_mime_utils_header_format_date:
 163  * @date: time_t date representation
 164  * @tz_offset: Timezone offset
 165  *
 166  * Allocates a string buffer containing the rfc822 formatted date
 167  * string represented by @time and @tz_offset.
 168  *
 169  * Returns: a valid string representation of the date.
 170  **/
 171 char *
 172 g_mime_utils_header_format_date (time_t date, int tz_offset)
 173 {
 174         struct tm tm;
 175
 176         date += ((tz_offset / 100) * (60 * 60)) + (tz_offset % 100) * 60;
 177
 178 #if defined (HAVE_GMTIME_R)
 179         gmtime_r (&date, &tm);
 180 #elif defined (HAVE_GMTIME_S)
 181         gmtime_s (&tm, &date);
 182 #else
 183         memcpy (&tm, gmtime (&date), sizeof (tm));
 184 #endif
 185
 186         return g_strdup_printf ("%s, %02d %s %04d %02d:%02d:%02d %+05d",
 187                                 tm_days[tm.tm_wday], tm.tm_mday,
 188                                 tm_months[tm.tm_mon],
 189                                 tm.tm_year + 1900,
 190                                 tm.tm_hour, tm.tm_min, tm.tm_sec,
 191                                 tz_offset);
 192 }
 193
 194 /* This is where it gets ugly... */
 195
/* One whitespace/punctuation-delimited piece of a date string, kept in a
 * singly linked list built by datetok(). */
typedef struct _date_token {
        struct _date_token *next;  /* must stay the first member: datetok() casts &head to date_token * */
        unsigned char mask;        /* OR of gmime_datetok_table[] over every character of the token */
        const char *start;         /* points into the caller's string; not NUL-terminated */
        size_t len;                /* number of characters in the token */
} date_token;

/* allocation helpers: tokens come from (and go back to) the GLib slice allocator */
#define date_token_free(tok) g_slice_free (date_token, tok)
#define date_token_new() g_slice_new (date_token)
 205
 206 static date_token *
 207 datetok (const char *date)
 208 {
 209         date_token *tokens = NULL, *token, *tail = (date_token *) &tokens;
 210         const char *start, *end;
 211         unsigned char mask;
 212
 213         start = date;
 214         while (*start) {
 215                 /* kill leading whitespace */
 216                 while (*start == ' ' || *start == '\t')
 217                         start++;
 218
 219                 if (*start == '\0')
 220                         break;
 221
 222                 mask = gmime_datetok_table[(unsigned char) *start];
 223
 224                 /* find the end of this token */
 225                 end = start + 1;
 226                 while (*end && !strchr ("-/,\t\r\n ", *end))
 227                         mask |= gmime_datetok_table[(unsigned char) *end++];
 228
 229                 if (end != start) {
 230                         token = date_token_new ();
 231                         token->next = NULL;
 232                         token->start = start;
 233                         token->len = end - start;
 234                         token->mask = mask;
 235
 236                         tail->next = token;
 237                         tail = token;
 238                 }
 239
 240                 if (*end)
 241                         start = end + 1;
 242                 else
 243                         break;
 244         }
 245
 246         return tokens;
 247 }
 248
 249 static int
 250 decode_int (const char *in, size_t inlen)
 251 {
 252         register const char *inptr;
 253         int sign = 1, val = 0;
 254         const char *inend;
 255
 256         inptr = in;
 257         inend = in + inlen;
 258
 259         if (*inptr == '-') {
 260                 sign = -1;
 261                 inptr++;
 262         } else if (*inptr == '+')
 263                 inptr++;
 264
 265         for ( ; inptr < inend; inptr++) {
 266                 if (!(*inptr >= '0' && *inptr <= '9'))
 267                         return -1;
 268                 else
 269                         val = (val * 10) + (*inptr - '0');
 270         }
 271
 272         val *= sign;
 273
 274         return val;
 275 }
 276
/* Disabled (compiled out by #if 0) helper, kept for reference. */
#if 0
/* Number of days in @month (1 = January ... 12 = December) of @year;
 * February depends on g_date_is_leap_year().  Returns 0 for a month
 * outside 1..12. */
static int
get_days_in_month (int month, int year)
{
        switch (month) {
        case 1:
        case 3:
        case 5:
        case 7:
        case 8:
        case 10:
        case 12:
                return 31;
        case 4:
        case 6:
        case 9:
        case 11:
                return 30;
        case 2:
                if (g_date_is_leap_year (year))
                        return 29;
                else
                        return 28;
        default:
                return 0;
        }
}
#endif
 305
 306 static int
 307 get_wday (const char *in, size_t inlen)
 308 {
 309         int wday;
 310
 311         g_return_val_if_fail (in != NULL, -1);
 312
 313         if (inlen < 3)
 314                 return -1;
 315
 316         for (wday = 0; wday < 7; wday++) {
 317                 if (!g_ascii_strncasecmp (in, tm_days[wday], 3))
 318                         return wday;
 319         }
 320
 321         return -1;  /* unknown week day */
 322 }
 323
 324 static int
 325 get_mday (const char *in, size_t inlen)
 326 {
 327         int mday;
 328
 329         g_return_val_if_fail (in != NULL, -1);
 330
 331         mday = decode_int (in, inlen);
 332
 333         if (mday < 0 || mday > 31)
 334                 mday = -1;
 335
 336         return mday;
 337 }
 338
 339 static int
 340 get_month (const char *in, size_t inlen)
 341 {
 342         int i;
 343
 344         g_return_val_if_fail (in != NULL, -1);
 345
 346         if (inlen < 3)
 347                 return -1;
 348
 349         for (i = 0; i < 12; i++) {
 350                 if (!g_ascii_strncasecmp (in, tm_months[i], 3))
 351                         return i;
 352         }
 353
 354         return -1;  /* unknown month */
 355 }
 356
 357 static int
 358 get_year (const char *in, size_t inlen)
 359 {
 360         int year;
 361
 362         g_return_val_if_fail (in != NULL, -1);
 363
 364         if ((year = decode_int (in, inlen)) == -1)
 365                 return -1;
 366
 367         if (year < 100)
 368                 year += (year < 70) ? 2000 : 1900;
 369
 370         if (year < 1969)
 371                 return -1;
 372
 373         return year;
 374 }
 375
 376 static gboolean
 377 get_time (const char *in, size_t inlen, int *hour, int *min, int *sec)
 378 {
 379         register const char *inptr;
 380         int *val, colons = 0;
 381         const char *inend;
 382
 383         *hour = *min = *sec = 0;
 384
 385         inend = in + inlen;
 386         val = hour;
 387         for (inptr = in; inptr < inend; inptr++) {
 388                 if (*inptr == ':') {
 389                         colons++;
 390                         switch (colons) {
 391                         case 1:
 392                                 val = min;
 393                                 break;
 394                         case 2:
 395                                 val = sec;
 396                                 break;
 397                         default:
 398                                 return FALSE;
 399                         }
 400                 } else if (!(*inptr >= '0' && *inptr <= '9'))
 401                         return FALSE;
 402                 else
 403                         *val = (*val * 10) + (*inptr - '0');
 404         }
 405
 406         return TRUE;
 407 }
 408
 409 static int
 410 get_tzone (date_token **token)
 411 {
 412         const char *inptr, *inend;
 413         size_t inlen;
 414         int i, t;
 415
 416         for (i = 0; *token && i < 2; *token = (*token)->next, i++) {
 417                 inptr = (*token)->start;
 418                 inlen = (*token)->len;
 419                 inend = inptr + inlen;
 420
 421                 if (*inptr == '+' || *inptr == '-') {
 422                         return decode_int (inptr, inlen);
 423                 } else {
 424                         if (*inptr == '(') {
 425                                 inptr++;
 426                                 if (*(inend - 1) == ')')
 427                                         inlen -= 2;
 428                                 else
 429                                         inlen--;
 430                         }
 431
 432                         for (t = 0; t < 15; t++) {
 433                                 size_t len = strlen (tz_offsets[t].name);
 434
 435                                 if (len != inlen)
 436                                         continue;
 437
 438                                 if (!strncmp (inptr, tz_offsets[t].name, len))
 439                                         return tz_offsets[t].offset;
 440                         }
 441                 }
 442         }
 443
 444         return -1;
 445 }
 446
/* Convert the broken-down time @tm, interpreted as GMT, to a time_t.
 *
 * mktime() interprets @tm as local time, so its result is corrected by the
 * local zone's distance from GMT ("tz", in seconds), which is obtained in a
 * platform-specific way selected at configure time.  @tm is modified:
 * tm_isdst is set to -1 before the call and mktime() may normalise the
 * other fields.
 * NOTE(review): a failing mktime() (return value (time_t) -1) is not
 * detected here. */
static time_t
mktime_utc (struct tm *tm)
{
        time_t tt;
        long tz;

        /* let mktime() work out whether DST is in effect */
        tm->tm_isdst = -1;
        tt = mktime (tm);

#if defined (G_OS_WIN32)
        _get_timezone (&tz);
        if (tm->tm_isdst > 0) {
                int dst;

                /* add the DST bias when mktime() decided DST applies */
                _get_dstbias (&dst);
                tz += dst;
        }
#elif defined (HAVE_TM_GMTOFF)
        /* tm_gmtoff has the opposite sign of the "timezone"-style value used below */
        tz = -tm->tm_gmtoff;
#elif defined (HAVE_TIMEZONE)
        if (tm->tm_isdst > 0) {
#if defined (HAVE_ALTZONE)
                tz = altzone;
#else /* !defined (HAVE_ALTZONE) */
                /* no altzone: assume DST is exactly one hour */
                tz = (timezone - 3600);
#endif
        } else {
                tz = timezone;
        }
#elif defined (HAVE__TIMEZONE)
        tz = _timezone;
#else
#error Neither HAVE_TIMEZONE nor HAVE_TM_GMTOFF defined. Rerun autoheader, autoconf, etc.
#endif

        return tt - tz;
}
 484
 485 static time_t
 486 parse_rfc822_date (date_token *tokens, int *tzone)
 487 {
 488         int hour, min, sec, offset, n;
 489         date_token *token;
 490         struct tm tm;
 491         time_t t;
 492
 493         g_return_val_if_fail (tokens != NULL, (time_t) 0);
 494
 495         token = tokens;
 496
 497         memset ((void *) &tm, 0, sizeof (struct tm));
 498
 499         if ((n = get_wday (token->start, token->len)) != -1) {
 500                 /* not all dates may have this... */
 501                 tm.tm_wday = n;
 502                 token = token->next;
 503         }
 504
 505         /* get the mday */
 506         if (!token || (n = get_mday (token->start, token->len)) == -1)
 507                 return (time_t) 0;
 508
 509         tm.tm_mday = n;
 510         token = token->next;
 511
 512         /* get the month */
 513         if (!token || (n = get_month (token->start, token->len)) == -1)
 514                 return (time_t) 0;
 515
 516         tm.tm_mon = n;
 517         token = token->next;
 518
 519         /* get the year */
 520         if (!token || (n = get_year (token->start, token->len)) == -1)
 521                 return (time_t) 0;
 522
 523         tm.tm_year = n - 1900;
 524         token = token->next;
 525
 526         /* get the hour/min/sec */
 527         if (!token || !get_time (token->start, token->len, &hour, &min, &sec))
 528                 return (time_t) 0;
 529
 530         tm.tm_hour = hour;
 531         tm.tm_min = min;
 532         tm.tm_sec = sec;
 533         token = token->next;
 534
 535         /* get the timezone */
 536         if (!token || (n = get_tzone (&token)) == -1) {
 537                 /* I guess we assume tz is GMT? */
 538                 offset = 0;
 539         } else {
 540                 offset = n;
 541         }
 542
 543         t = mktime_utc (&tm);
 544
 545         /* t is now GMT of the time we want, but not offset by the timezone ... */
 546
 547         /* this should convert the time to the GMT equiv time */
 548         t -= ((offset / 100) * 60 * 60) + (offset % 100) * 60;
 549
 550         if (tzone)
 551                 *tzone = offset;
 552
 553         return t;
 554 }
 555
 556
/* Token classification helpers.  A token "is" of a class when none of its
 * characters set the corresponding DATE_TOKEN_NON_* bit; is_time and
 * is_tzone_numeric additionally require a ':' or a sign somewhere in the
 * token.  These only look at the character classes, not at the token's
 * actual value. */
#define date_token_mask(t)  (((date_token *) t)->mask)
#define is_numeric(t)       ((date_token_mask (t) & DATE_TOKEN_NON_NUMERIC) == 0)
#define is_weekday(t)       ((date_token_mask (t) & DATE_TOKEN_NON_WEEKDAY) == 0)
#define is_month(t)         ((date_token_mask (t) & DATE_TOKEN_NON_MONTH) == 0)
#define is_time(t)          (((date_token_mask (t) & DATE_TOKEN_NON_TIME) == 0) && (date_token_mask (t) & DATE_TOKEN_HAS_COLON))
#define is_tzone_alpha(t)   ((date_token_mask (t) & DATE_TOKEN_NON_TIMEZONE_ALPHA) == 0)
#define is_tzone_numeric(t) (((date_token_mask (t) & DATE_TOKEN_NON_TIMEZONE_NUMERIC) == 0) && (date_token_mask (t) & DATE_TOKEN_HAS_SIGN))
#define is_tzone(t)         (is_tzone_alpha (t) || is_tzone_numeric (t))
 565
/* Lenient fallback parser for dates that parse_rfc822_date() rejected.
 *
 * Walks the tokens in order and assigns each one to the first field it can
 * plausibly fill (weekday, month, time, timezone, then the numeric fields),
 * using the character-class macros above as a pre-filter.  Tokens that fit
 * nowhere are ignored.  Fields that are never found stay 0.
 *
 * Unlike the strict parser this never reports failure: it always returns
 * whatever mktime_utc() makes of the collected fields, adjusted by the
 * timezone offset, and stores that offset in *@tzone if @tzone is non-NULL. */
static time_t
parse_broken_date (date_token *tokens, int *tzone)
{
        gboolean got_wday, got_month, got_tzone;
        int hour, min, sec, offset, n;
        date_token *token;
        struct tm tm;
        time_t t;

        memset ((void *) &tm, 0, sizeof (struct tm));
        got_wday = got_month = got_tzone = FALSE;
        offset = 0;

        token = tokens;
        while (token) {
                if (is_weekday (token) && !got_wday) {
                        if ((n = get_wday (token->start, token->len)) != -1) {
                                d(printf ("weekday; "));
                                got_wday = TRUE;
                                tm.tm_wday = n;
                                goto next;
                        }
                }

                if (is_month (token) && !got_month) {
                        if ((n = get_month (token->start, token->len)) != -1) {
                                d(printf ("month; "));
                                got_month = TRUE;
                                tm.tm_mon = n;
                                goto next;
                        }
                }

                /* "no time seen yet" is inferred from all three fields still
                 * being 0, so a 00:00:00 token does not block a later one */
                if (is_time (token) && !tm.tm_hour && !tm.tm_min && !tm.tm_sec) {
                        if (get_time (token->start, token->len, &hour, &min, &sec)) {
                                d(printf ("time; "));
                                tm.tm_hour = hour;
                                tm.tm_min = min;
                                tm.tm_sec = sec;
                                goto next;
                        }
                }

                if (is_tzone (token) && !got_tzone) {
                        /* work on a copy: get_tzone() may advance its argument.
                         * (this t shadows the time_t t declared above) */
                        date_token *t = token;

                        if ((n = get_tzone (&t)) != -1) {
                                d(printf ("tzone; "));
                                got_tzone = TRUE;
                                offset = n;
                                goto next;
                        }
                }

                if (is_numeric (token)) {
                        /* an all-digit token of length 4 is taken as the year */
                        if (token->len == 4 && !tm.tm_year) {
                                if ((n = get_year (token->start, token->len)) != -1) {
                                        d(printf ("year; "));
                                        tm.tm_year = n - 1900;
                                        goto next;
                                }
                        } else {
                                /* Note: assumes MM-DD-YY ordering if '0 < MM < 12' holds true */
                                if (!got_month && token->next && is_numeric (token->next)) {
                                        if ((n = decode_int (token->start, token->len)) > 12) {
                                                /* cannot be a month: store it as the day.
                                                 * NOTE(review): this jump skips both the
                                                 * !tm.tm_mday test and get_mday()'s upper
                                                 * bound of 31. */
                                                goto mday;
                                        } else if (n > 0) {
                                                d(printf ("mon; "));
                                                got_month = TRUE;
                                                tm.tm_mon = n - 1;
                                        }
                                        goto next;
                                } else if (!tm.tm_mday && (n = get_mday (token->start, token->len)) != -1) {
                                mday:
                                        d(printf ("mday; "));
                                        tm.tm_mday = n;
                                        goto next;
                                } else if (!tm.tm_year) {
                                        if ((n = get_year (token->start, token->len)) != -1) {
                                                d(printf ("2-digit year; "));
                                                tm.tm_year = n - 1900;
                                        }
                                        goto next;
                                }
                        }
                }

                /* token did not fit any field */
                d(printf ("???; "));

        next:

                token = token->next;
        }

        d(printf ("\n"));

        t = mktime_utc (&tm);

        /* t is now GMT of the time we want, but not offset by the timezone ... */

        /* this should convert the time to the GMT equiv time */
        t -= ((offset / 100) * 60 * 60) + (offset % 100) * 60;

        if (tzone)
                *tzone = offset;

        return t;
}
 674
/* Disabled (compiled out by #if 0) generator for gmime_datetok_table. */
#if 0
/* Recomputes gmime_datetok_table from the *_CHARS sets and prints it to
 * stdout as a C initialiser, 16 values per row.
 *
 * For every byte value i, a DATE_TOKEN_NON_* bit is set when i does not
 * occur in the corresponding character set; ':' gets HAS_COLON and '+' / '-'
 * get HAS_SIGN.  Because strchr() also matches a string's terminating NUL,
 * entry 0 receives no NON_* bit and does receive HAS_SIGN. */
static void
gmime_datetok_table_init (void)
{
        int i;

        memset (gmime_datetok_table, 0, sizeof (gmime_datetok_table));

        for (i = 0; i < 256; i++) {
                if (!strchr (NUMERIC_CHARS, i))
                        gmime_datetok_table[i] |= DATE_TOKEN_NON_NUMERIC;

                if (!strchr (WEEKDAY_CHARS, i))
                        gmime_datetok_table[i] |= DATE_TOKEN_NON_WEEKDAY;

                if (!strchr (MONTH_CHARS, i))
                        gmime_datetok_table[i] |= DATE_TOKEN_NON_MONTH;

                if (!strchr (TIME_CHARS, i))
                        gmime_datetok_table[i] |= DATE_TOKEN_NON_TIME;

                if (!strchr (TIMEZONE_ALPHA_CHARS, i))
                        gmime_datetok_table[i] |= DATE_TOKEN_NON_TIMEZONE_ALPHA;

                if (!strchr (TIMEZONE_NUMERIC_CHARS, i))
                        gmime_datetok_table[i] |= DATE_TOKEN_NON_TIMEZONE_NUMERIC;

                if (((char) i) == ':')
                        gmime_datetok_table[i] |= DATE_TOKEN_HAS_COLON;

                if (strchr ("+-", i))
                        gmime_datetok_table[i] |= DATE_TOKEN_HAS_SIGN;
        }

        /* dump the table in the same layout as the static definition */
        printf ("static unsigned char gmime_datetok_table[256] = {");
        for (i = 0; i < 256; i++) {
                if (i % 16 == 0)
                        printf ("\n\t");
                printf ("%3d,", gmime_datetok_table[i]);
        }
        printf ("\n};\n");
}
#endif
 718
 719
 720 /**
 721  * g_mime_utils_header_decode_date:
 722  * @str: input date string
 723  * @tz_offset: timezone offset
 724  *
 725  * Decodes the rfc822 date string and saves the GMT offset into
 726  * @tz_offset if non-NULL.
 727  *
 728  * Returns: the time_t representation of the date string specified by
 729  * @str or (time_t) %0 on error. If @tz_offset is non-NULL, the value
 730  * of the timezone offset will be stored.
 731  **/
 732 time_t
 733 g_mime_utils_header_decode_date (const char *str, int *tz_offset)
 734 {
 735         date_token *token, *tokens;
 736         time_t date;
 737
 738         if (!(tokens = datetok (str))) {
 739                 if (tz_offset)
 740                         *tz_offset = 0;
 741
 742                 return (time_t) 0;
 743         }
 744
 745         if (!(date = parse_rfc822_date (tokens, tz_offset)))
 746                 date = parse_broken_date (tokens, tz_offset);
 747
 748         /* cleanup */
 749         while (tokens) {
 750                 token = tokens;
 751                 tokens = tokens->next;
 752                 date_token_free (token);
 753         }
 754
 755         return date;
 756 }
 757
 758
 759 /**
 760  * g_mime_utils_generate_message_id:
 761  * @fqdn: Fully qualified domain name
 762  *
 763  * Generates a unique Message-Id.
 764  *
 765  * Returns: a unique string in an addr-spec format suitable for use as
 766  * a Message-Id.
 767  **/
 768 char *
 769 g_mime_utils_generate_message_id (const char *fqdn)
 770 {
 771 #ifdef G_THREADS_ENABLED
 772         static GStaticMutex mutex = G_STATIC_MUTEX_INIT;
 773 #define MUTEX_LOCK()   g_static_mutex_lock (&mutex)
 774 #define MUTEX_UNLOCK() g_static_mutex_unlock (&mutex)
 775 #else
 776 #define MUTEX_LOCK()
 777 #define MUTEX_UNLOCK()
 778 #endif
 779         static unsigned long int count = 0;
 780         const char *hostname = NULL;
 781         char *name = NULL;
 782         char *msgid;
 783
 784         if (!fqdn) {
 785 #ifdef HAVE_UTSNAME_DOMAINNAME
 786                 struct utsname unam;
 787
 788                 uname (&unam);
 789
 790                 hostname = unam.nodename;
 791
 792                 if (unam.domainname[0])
 793                         name = g_strdup_printf ("%s.%s", hostname, unam.domainname);
 794 #else /* ! HAVE_UTSNAME_DOMAINNAME */
 795                 char host[MAXHOSTNAMELEN + 1];
 796
 797 #ifdef HAVE_GETHOSTNAME
 798                 host[MAXHOSTNAMELEN] = '\0';
 799                 if (gethostname (host, MAXHOSTNAMELEN) == 0) {
 800 #ifdef HAVE_GETDOMAINNAME
 801                         size_t domainlen = MAXHOSTNAMELEN;
 802                         char *domain;
 803                         int rv;
 804
 805                         domain = g_malloc (domainlen);
 806
 807                         while ((rv = getdomainname (domain, domainlen)) == -1 && errno == EINVAL) {
 808                                 domainlen += MAXHOSTNAMELEN;
 809                                 domain = g_realloc (domain, domainlen);
 810                         }
 811
 812                         if (rv == 0 && domain[0]) {
 813                                 if (host[0]) {
 814                                         name = g_strdup_printf ("%s.%s", host, domain);
 815                                         g_free (domain);
 816                                 } else {
 817                                         name = domain;
 818                                 }
 819                         }
 820 #endif /* HAVE_GETDOMAINNAME */
 821                 } else {
 822                         host[0] = '\0';
 823                 }
 824 #endif /* HAVE_GETHOSTNAME */
 825                 hostname = host;
 826 #endif /* HAVE_UTSNAME_DOMAINNAME */
 827
 828 #ifdef HAVE_GETADDRINFO
 829                 if (!name && hostname[0]) {
 830                         /* we weren't able to get a domain name */
 831                         struct addrinfo hints, *res;
 832
 833                         memset (&hints, 0, sizeof (hints));
 834                         hints.ai_flags = AI_CANONNAME;
 835
 836                         if (getaddrinfo (hostname, NULL, &hints, &res) == 0) {
 837                                 name = g_strdup (res->ai_canonname);
 838                                 freeaddrinfo (res);
 839                         }
 840                 }
 841 #endif /* HAVE_GETADDRINFO */
 842
 843                 fqdn = name != NULL ? name : (hostname[0] ? hostname : "localhost.localdomain");
 844         }
 845
 846         MUTEX_LOCK ();
 847         msgid = g_strdup_printf ("%lu.%lu.%lu@%s", (unsigned long int) time (NULL),
 848                                  (unsigned long int) getpid (), count++, fqdn);
 849         MUTEX_UNLOCK ();
 850
 851         g_free (name);
 852
 853         return msgid;
 854 }
 855
 856 static char *
 857 decode_addrspec (const char **in)
 858 {
 859         const char *word, *inptr;
 860         GString *addrspec;
 861         char *str;
 862
 863         decode_lwsp (in);
 864         inptr = *in;
 865
 866         if (!(word = decode_word (&inptr))) {
 867                 w(g_warning ("No local-part in addr-spec: %s", *in));
 868                 return NULL;
 869         }
 870
 871         addrspec = g_string_new ("");
 872         g_string_append_len (addrspec, word, (size_t) (inptr - word));
 873
 874         /* get the rest of the local-part */
 875         decode_lwsp (&inptr);
 876         while (*inptr == '.') {
 877                 g_string_append_c (addrspec, *inptr++);
 878                 if ((word = decode_word (&inptr))) {
 879                         g_string_append_len (addrspec, word, (size_t) (inptr - word));
 880                         decode_lwsp (&inptr);
 881                 } else {
 882                         w(g_warning ("Invalid local-part in addr-spec: %s", *in));
 883                         goto exception;
 884                 }
 885         }
 886
 887         /* we should be at the '@' now... */
 888         if (*inptr++ != '@') {
 889                 w(g_warning ("Invalid addr-spec; missing '@': %s", *in));
 890                 goto exception;
 891         }
 892
 893         g_string_append_c (addrspec, '@');
 894         if (!decode_domain (&inptr, addrspec)) {
 895                 w(g_warning ("No domain in addr-spec: %s", *in));
 896                 goto exception;
 897         }
 898
 899         str = addrspec->str;
 900         g_string_free (addrspec, FALSE);
 901
 902         *in = inptr;
 903
 904         return str;
 905
 906  exception:
 907
 908         g_string_free (addrspec, TRUE);
 909
 910         return NULL;
 911 }
 912
 /*
  * decode_msgid:
  * @in: address of a cursor into the header text being parsed
  *
  * Parses a single msg-id of the form "<" addr-spec ">" starting at *@in.
  * A missing '<' or '>' is tolerated (only a warning is issued through
  * the w() macro). If no addr-spec can be decoded, the raw text up to the
  * next '>' (or the end of the string) is returned instead.
  *
  * On return *@in points just past the msg-id, including its closing
  * '>' when one is present.
  *
  * Returns: a newly allocated string holding the msg-id contents.
  */
 static char *
 decode_msgid (const char **in)
 {
         const char *inptr = *in;
         char *msgid = NULL;

         decode_lwsp (&inptr);
         if (*inptr != '<') {
                 w(g_warning ("Invalid msg-id; missing '<': %s", *in));
         } else {
                 inptr++;
         }

         decode_lwsp (&inptr);
         if ((msgid = decode_addrspec (&inptr))) {
                 decode_lwsp (&inptr);
                 if (*inptr != '>') {
                         w(g_warning ("Invalid msg-id; missing '>': %s", *in));
                 } else {
                         inptr++;
                 }

                 *in = inptr;
         } else {
                 w(g_warning ("Invalid msg-id; missing addr-spec: %s", *in));
                 *in = inptr;
                 while (*inptr && *inptr != '>')
                         inptr++;

                 msgid = g_strndup (*in, (size_t) (inptr - *in));

                 /* consume the closing '>' just like the well-formed path
                  * does, so the caller does not resume parsing on a stray
                  * '>' character */
                 if (*inptr == '>')
                         inptr++;

                 *in = inptr;
         }

         return msgid;
 }
 948
 949
 /**
  * g_mime_utils_decode_message_id:
  * @message_id: string containing a message-id
  *
  * Decodes a msg-id as defined by rfc822. The parsing itself is
  * delegated to decode_msgid().
  *
  * Returns: the addr-spec portion of the msg-id, or %NULL if
  * @message_id is %NULL.
  **/
 char *
 g_mime_utils_decode_message_id (const char *message_id)
 {
         const char *cursor;

         g_return_val_if_fail (message_id != NULL, NULL);

         /* decode_msgid() advances the cursor it is given; use a local
          * copy since the final position is of no interest here */
         cursor = message_id;

         return decode_msgid (&cursor);
 }
 965
 966
 967 /**
 968  * g_mime_references_decode:
 969  * @text: string containing a list of msg-ids
 970  *
 971  * Decodes a list of msg-ids as in the References and/or In-Reply-To
 972  * headers defined in rfc822.
 973  *
 974  * Returns: a list of referenced msg-ids.
 975  **/
 976 GMimeReferences *
 977 g_mime_references_decode (const char *text)
 978 {
 979         GMimeReferences *refs, *tail, *ref;
 980         const char *word, *inptr = text;
 981         char *msgid;
 982
 983         g_return_val_if_fail (text != NULL, NULL);
 984
 985         refs = NULL;
 986         tail = (GMimeReferences *) &refs;
 987
 988         while (*inptr) {
 989                 decode_lwsp (&inptr);
 990                 if (*inptr == '<') {
 991                         /* looks like a msg-id */
 992                         if ((msgid = decode_msgid (&inptr))) {
 993                                 ref = g_new (GMimeReferences, 1);
 994                                 ref->next = NULL;
 995                                 ref->msgid = msgid;
 996                                 tail->next = ref;
 997                                 tail = ref;
 998                         } else {
 999                                 w(g_warning ("Invalid References header: %s", inptr));
1000                                 break;
1001                         }
1002                 } else if (*inptr) {
1003                         /* looks like part of a phrase */
1004                         if (!(word = decode_word (&inptr))) {
1005                                 w(g_warning ("Invalid References header: %s", inptr));
1006                                 break;
1007                         }
1008                 }
1009         }
1010
1011         return refs;
1012 }
1013
1014
1015 /**
1016  * g_mime_references_append:
1017  * @refs: the address of a #GMimeReferences list
1018  * @msgid: a message-id string
1019  *
1020  * Appends a reference to msgid to the list of references.
1021  **/
1022 void
1023 g_mime_references_append (GMimeReferences **refs, const char *msgid)
1024 {
1025         GMimeReferences *ref;
1026
1027         g_return_if_fail (refs != NULL);
1028         g_return_if_fail (msgid != NULL);
1029
1030         ref = (GMimeReferences *) refs;
1031         while (ref->next)
1032                 ref = ref->next;
1033
1034         ref->next = g_new (GMimeReferences, 1);
1035         ref->next->msgid = g_strdup (msgid);
1036         ref->next->next = NULL;
1037 }
1038
1039
1040 /**
1041  * g_mime_references_free:
1042  * @refs: a #GMimeReferences list
1043  *
1044  * Frees the #GMimeReferences list.
1045  **/
1046 void
1047 g_mime_references_free (GMimeReferences *refs)
1048 {
1049         GMimeReferences *ref, *next;
1050
1051         ref = refs;
1052         while (ref) {
1053                 next = ref->next;
1054                 g_free (ref->msgid);
1055                 g_free (ref);
1056                 ref = next;
1057         }
1058 }
1059
1060
1061 /**
1062  * g_mime_references_clear:
1063  * @refs: address of a #GMimeReferences list
1064  *
1065  * Clears the #GMimeReferences list and resets it to %NULL.
1066  **/
1067 void
1068 g_mime_references_clear (GMimeReferences **refs)
1069 {
1070         g_return_if_fail (refs != NULL);
1071
1072         g_mime_references_free (*refs);
1073         *refs = NULL;
1074 }
1075
1076
1077 /**
1078  * g_mime_references_get_next:
1079  * @ref: a #GMimeReferences list
1080  *
1081  * Advances to the next reference node in the #GMimeReferences list.
1082  *
1083  * Returns: the next reference node in the #GMimeReferences list.
1084  **/
1085 const GMimeReferences *
1086 g_mime_references_get_next (const GMimeReferences *ref)
1087 {
1088         return ref ? ref->next : NULL;
1089 }
1090
1091
1092 /**
1093  * g_mime_references_get_message_id:
1094  * @ref: a #GMimeReferences list
1095  *
1096  * Gets the Message-Id reference from the #GMimeReferences node.
1097  *
1098  * Returns: the Message-Id reference from the #GMimeReferences node.
1099  **/
1100 const char *
1101 g_mime_references_get_message_id (const GMimeReferences *ref)
1102 {
1103         return ref ? ref->msgid : NULL;
1104 }
1105
1106
1107 static gboolean
1108 is_rfc2047_token (const char *inptr, size_t len)
1109 {
1110         if (len < 8 || strncmp (inptr, "=?", 2) != 0 || strncmp (inptr + len - 2, "?=", 2) != 0)
1111                 return FALSE;
1112
1113         inptr += 2;
1114         len -= 2;
1115
1116         /* skip past the charset */
1117         while (*inptr != '?' && len > 0) {
1118                 inptr++;
1119                 len--;
1120         }
1121
1122         if (*inptr != '?' || len < 4)
1123                 return FALSE;
1124
1125         if (inptr[1] != 'q' && inptr[1] != 'Q' && inptr[1] != 'b' && inptr[1] != 'B')
1126                 return FALSE;
1127
1128         inptr += 2;
1129         len -= 2;
1130
1131         if (*inptr != '?')
1132                 return FALSE;
1133
1134         return TRUE;
1135 }
1136
1137 static char *
1138 header_fold (const char *in, gboolean structured)
1139 {
1140         gboolean last_was_lwsp = FALSE;
1141         register const char *inptr;
1142         size_t len, outlen, i;
1143         size_t fieldlen;
1144         GString *out;
1145         char *ret;
1146
1147         inptr = in;
1148         len = strlen (in);
1149         if (len <= GMIME_FOLD_LEN + 1)
1150                 return g_strdup (in);
1151
1152         out = g_string_new ("");
1153         fieldlen = strcspn (inptr, ": \t\n");
1154         g_string_append_len (out, inptr, fieldlen);
1155         outlen = fieldlen;
1156         inptr += fieldlen;
1157
1158         while (*inptr && *inptr != '\n') {
1159                 len = strcspn (inptr, " \t\n");
1160
1161                 if (len > 1 && outlen + len > GMIME_FOLD_LEN) {
1162                         if (outlen > 1 && out->len > fieldlen + 2) {
1163                                 if (last_was_lwsp) {
1164                                         if (structured)
1165                                                 out->str[out->len - 1] = '\t';
1166
1167                                         g_string_insert_c (out, out->len - 1, '\n');
1168                                 } else
1169                                         g_string_append (out, "\n\t");
1170                                 outlen = 1;
1171                         }
1172
1173                         if (!structured && !is_rfc2047_token (inptr, len)) {
1174                                 /* check for very long words, just cut them up */
1175                                 while (outlen + len > GMIME_FOLD_LEN) {
1176                                         for (i = 0; i < GMIME_FOLD_LEN - outlen; i++)
1177                                                 g_string_append_c (out, inptr[i]);
1178                                         inptr += GMIME_FOLD_LEN - outlen;
1179                                         len -= GMIME_FOLD_LEN - outlen;
1180                                         g_string_append (out, "\n\t");
1181                                         outlen = 1;
1182                                 }
1183                         } else {
1184                                 g_string_append_len (out, inptr, len);
1185                                 outlen += len;
1186                                 inptr += len;
1187                         }
1188                         last_was_lwsp = FALSE;
1189                 } else if (len > 0) {
1190                         g_string_append_len (out, inptr, len);
1191                         outlen += len;
1192                         inptr += len;
1193                         last_was_lwsp = FALSE;
1194                 } else {
1195                         last_was_lwsp = TRUE;
1196                         if (*inptr == '\t') {
1197                                 /* tabs are a good place to fold, odds
1198                                    are that this is where the previous
1199                                    mailer folded it */
1200                                 g_string_append (out, "\n\t");
1201                                 outlen = 1;
1202                                 while (is_blank (*inptr))
1203                                         inptr++;
1204                         } else {
1205                                 g_string_append_c (out, *inptr++);
1206                                 outlen++;
1207                         }
1208                 }
1209         }
1210
1211         if (*inptr == '\n' && out->str[out->len - 1] != '\n')
1212                 g_string_append_c (out, '\n');
1213
1214         ret = out->str;
1215         g_string_free (out, FALSE);
1216
1217         return ret;
1218 }
1219
1220
1221 /**
1222  * g_mime_utils_structured_header_fold:
1223  * @str: input string
1224  *
1225  * Folds a structured header according to the rules in rfc822.
1226  *
1227  * Returns: an allocated string containing the folded header.
1228  **/
1229 char *
1230 g_mime_utils_structured_header_fold (const char *str)
1231 {
1232         return header_fold (str, TRUE);
1233 }
1234
1235
1236 /**
1237  * g_mime_utils_unstructured_header_fold:
1238  * @str: input string
1239  *
1240  * Folds an unstructured header according to the rules in rfc822.
1241  *
1242  * Returns: an allocated string containing the folded header.
1243  **/
1244 char *
1245 g_mime_utils_unstructured_header_fold (const char *str)
1246 {
1247         return header_fold (str, FALSE);
1248 }
1249
1250
1251 /**
1252  * g_mime_utils_header_fold:
1253  * @str: input string
1254  *
1255  * Folds a structured header according to the rules in rfc822.
1256  *
1257  * Returns: an allocated string containing the folded header.
1258  **/
1259 char *
1260 g_mime_utils_header_fold (const char *str)
1261 {
1262         return header_fold (str, TRUE);
1263 }
1264
1265
1266 /**
1267  * g_mime_utils_header_printf:
1268  * @format: string format
1269  * @Varargs: arguments
1270  *
1271  * Allocates a buffer containing a formatted header specified by the
1272  * @Varargs.
1273  *
1274  * Returns: an allocated string containing the folded header specified
1275  * by @format and the following arguments.
1276  **/
1277 char *
1278 g_mime_utils_header_printf (const char *format, ...)
1279 {
1280         char *buf, *ret;
1281         va_list ap;
1282
1283         va_start (ap, format);
1284         buf = g_strdup_vprintf (format, ap);
1285         va_end (ap);
1286
1287         ret = header_fold (buf, TRUE);
1288         g_free (buf);
1289
1290         return ret;
1291 }
1292
1293 static gboolean
1294 need_quotes (const char *string)
1295 {
1296         gboolean quoted = FALSE;
1297         const char *inptr;
1298
1299         inptr = string;
1300
1301         while (*inptr) {
1302                 if (*inptr == '\\')
1303                         inptr++;
1304                 else if (*inptr == '"')
1305                         quoted = !quoted;
1306                 else if (!quoted && (is_tspecial (*inptr) || *inptr == '.'))
1307                         return TRUE;
1308
1309                 if (*inptr)
1310                         inptr++;
1311         }
1312
1313         return FALSE;
1314 }
1315
1316 /**
1317  * g_mime_utils_quote_string:
1318  * @str: input string
1319  *
1320  * Quotes @string as needed according to the rules in rfc2045.
1321  *
1322  * Returns: an allocated string containing the escaped and quoted (if
1323  * needed to be) input string. The decision to quote the string is
1324  * based on whether or not the input string contains any 'tspecials'
1325  * as defined by rfc2045.
1326  **/
1327 char *
1328 g_mime_utils_quote_string (const char *str)
1329 {
1330         gboolean quote;
1331         const char *c;
1332         char *qstring;
1333         GString *out;
1334
1335         out = g_string_new ("");
1336
1337         if ((quote = need_quotes (str)))
1338                 g_string_append_c (out, '"');
1339
1340         for (c = str; *c; c++) {
1341                 if ((*c == '"' && quote) || *c == '\\')
1342                         g_string_append_c (out, '\\');
1343
1344                 g_string_append_c (out, *c);
1345         }
1346
1347         if (quote)
1348                 g_string_append_c (out, '"');
1349
1350         qstring = out->str;
1351         g_string_free (out, FALSE);
1352
1353         return qstring;
1354 }
1355
1356
1357 /**
1358  * g_mime_utils_unquote_string:
1359  * @str: input string
1360  *
1361  * Unquotes and unescapes a string.
1362  **/
1363 void
1364 g_mime_utils_unquote_string (char *str)
1365 {
1366         /* if the string is quoted, unquote it */
1367         register char *inptr = str;
1368         int escaped = FALSE;
1369         int quoted = FALSE;
1370
1371         if (!str)
1372                 return;
1373
1374         while (*inptr) {
1375                 if (*inptr == '\\') {
1376                         if (escaped)
1377                                 *str++ = *inptr++;
1378                         else
1379                                 inptr++;
1380                         escaped = !escaped;
1381                 } else if (*inptr == '"') {
1382                         if (escaped) {
1383                                 *str++ = *inptr++;
1384                                 escaped = FALSE;
1385                         } else {
1386                                 quoted = !quoted;
1387                                 inptr++;
1388                         }
1389                 } else {
1390                         *str++ = *inptr++;
1391                         escaped = FALSE;
1392                 }
1393         }
1394
1395         *str = '\0';
1396 }
1397
1398
1399 /**
1400  * g_mime_utils_text_is_8bit:
1401  * @text: text to check for 8bit chars
1402  * @len: text length
1403  *
1404  * Determines if @text contains 8bit characters within the first @len
1405  * bytes.
1406  *
1407  * Returns: %TRUE if the text contains 8bit characters or %FALSE
1408  * otherwise.
1409  **/
1410 gboolean
1411 g_mime_utils_text_is_8bit (const unsigned char *text, size_t len)
1412 {
1413         register const unsigned char *inptr;
1414         const unsigned char *inend;
1415
1416         g_return_val_if_fail (text != NULL, FALSE);
1417
1418         inend = text + len;
1419         for (inptr = text; *inptr && inptr < inend; inptr++)
1420                 if (*inptr > (unsigned char) 127)
1421                         return TRUE;
1422
1423         return FALSE;
1424 }
1425
1426
1427 /**
1428  * g_mime_utils_best_encoding:
1429  * @text: text to encode
1430  * @len: text length
1431  *
1432  * Determines the best content encoding for the first @len bytes of
1433  * @text.
1434  *
1435  * Returns: a #GMimeContentEncoding that is determined to be the best
1436  * encoding type for the specified block of text. ("best" in this
1437  * particular case means smallest output size)
1438  **/
1439 GMimeContentEncoding
1440 g_mime_utils_best_encoding (const unsigned char *text, size_t len)
1441 {
1442         const unsigned char *ch, *inend;
1443         size_t count = 0;
1444
1445         inend = text + len;
1446         for (ch = text; ch < inend; ch++)
1447                 if (*ch > (unsigned char) 127)
1448                         count++;
1449
1450         if ((float) count <= len * 0.17)
1451                 return GMIME_CONTENT_ENCODING_QUOTEDPRINTABLE;
1452         else
1453                 return GMIME_CONTENT_ENCODING_BASE64;
1454 }
1455
1456
1457 /**
1458  * charset_convert:
1459  * @cd: iconv converter
1460  * @inbuf: input text buffer to convert
1461  * @inleft: length of the input buffer
1462  * @outp: pointer to output buffer
1463  * @outlenp: pointer to output buffer length
1464  * @ninval: the number of invalid bytes in @inbuf
1465  *
1466  * Converts the input buffer from one charset to another using the
1467  * @cd. On completion, @outp will point to the output buffer
1468  * containing the converted text (nul-terminated), @outlenp will be
1469  * the size of the @outp buffer (note: not the strlen() of @outp) and
1470  * @ninval will contain the number of bytes which could not be
1471  * converted.
1472  *
1473  * Bytes which cannot be converted from @inbuf will appear as '?'
1474  * characters in the output buffer.
1475  *
1476  * If *@outp is non-NULL, then it is assumed that it points to a
1477  * pre-allocated buffer of length *@outlenp. This is done so that the
1478  * same output buffer can be reused multiple times.
1479  *
1480  * Returns: the string length of the output buffer.
1481  **/
1482 static size_t
1483 charset_convert (iconv_t cd, const char *inbuf, size_t inleft, char **outp, size_t *outlenp, size_t *ninval)
1484 {
1485         size_t outlen, outleft, rc, n = 0;
1486         char *outbuf, *out;
1487
1488         if (*outp == NULL) {
1489                 outleft = outlen = (inleft * 2) + 16;
1490                 outbuf = out = g_malloc (outlen + 1);
1491         } else {
1492                 outleft = outlen = *outlenp;
1493                 outbuf = out = *outp;
1494         }
1495
1496         do {
1497                 rc = iconv (cd, (char **) &inbuf, &inleft, &outbuf, &outleft);
1498                 if (rc == (size_t) -1) {
1499                         if (errno == EINVAL) {
1500                                 /* incomplete sequence at the end of the input buffer */
1501                                 n += inleft;
1502                                 break;
1503                         }
1504
1505 #ifdef G_OS_WIN32
1506                         /* seems that GnuWin32's libiconv 1.9 does not set errno in
1507                          * the E2BIG case, so we have to fake it */
1508                         if (outleft <= inleft)
1509                                 errno = E2BIG;
1510 #endif
1511
1512                         if (errno == E2BIG) {
1513                                 /* need to grow the output buffer */
1514                                 outlen += (inleft * 2) + 16;
1515                                 rc = (size_t) (outbuf - out);
1516                                 out = g_realloc (out, outlen + 1);
1517                                 outleft = outlen - rc;
1518                                 outbuf = out + rc;
1519                         } else {
1520                                 /* invalid byte(-sequence) in the input buffer */
1521                                 *outbuf++ = '?';
1522                                 outleft--;
1523                                 inleft--;
1524                                 inbuf++;
1525                                 n++;
1526                         }
1527                 }
1528         } while (inleft > 0);
1529
1530         iconv (cd, NULL, NULL, &outbuf, &outleft);
1531         *outbuf++ = '\0';
1532
1533         *outlenp = outlen;
1534         *outp = out;
1535         *ninval = n;
1536
1537         return (outbuf - out);
1538 }
1539
1540
/* flags recording which of the implicit fallback charsets already
 * appear in the user-supplied charset list */
#define USER_CHARSETS_INCLUDE_UTF8    (1 << 0)
#define USER_CHARSETS_INCLUDE_LOCALE  (1 << 1)


/**
 * g_mime_utils_decode_8bit:
 * @text: input text in unknown 8bit/multibyte character set
 * @len: input text length
 *
 * Attempts to convert text in an unknown 8bit/multibyte charset into
 * UTF-8 by finding the charset which will convert the most bytes into
 * valid UTF-8 characters as possible. If no exact match can be found,
 * it will choose the best match and convert invalid byte sequences
 * into question-marks (?) in the returned string buffer.
 *
 * The candidates are the user charsets (g_mime_user_charsets()),
 * followed by UTF-8 and the locale charset when those are not already
 * in the user list.
 *
 * Returns: a UTF-8 string representation of @text, or %NULL if @text
 * is %NULL.
 **/
char *
g_mime_utils_decode_8bit (const char *text, size_t len)
{
        const char **charsets, **user_charsets, *locale, *best;
        size_t outleft, outlen, min, ninval;
        unsigned int included = 0;
        iconv_t cd;
        char *out;
        int i = 0;

        g_return_val_if_fail (text != NULL, NULL);

        /* a UTF-8 locale is already covered by the UTF-8 candidate */
        locale = g_mime_locale_charset ();
        if (locale && !g_ascii_strcasecmp (locale, "UTF-8"))
                included |= USER_CHARSETS_INCLUDE_LOCALE;

        if ((user_charsets = g_mime_user_charsets ())) {
                while (user_charsets[i])
                        i++;
        }

        /* room for the user charsets + UTF-8 + locale + NULL terminator */
        charsets = g_alloca (sizeof (char *) * (i + 3));
        i = 0;

        if (user_charsets) {
                while (user_charsets[i]) {
                        /* keep a record of whether or not the user-supplied
                         * charsets include UTF-8 and/or the default fallback
                         * charset so that we avoid doubling our efforts for
                         * these 2 charsets. We could have used a hash table
                         * to keep track of unique charsets, but we can
                         * (hopefully) assume that user_charsets is a unique
                         * list of charsets with no duplicates. */
                        if (!g_ascii_strcasecmp (user_charsets[i], "UTF-8"))
                                included |= USER_CHARSETS_INCLUDE_UTF8;

                        if (locale && !g_ascii_strcasecmp (user_charsets[i], locale))
                                included |= USER_CHARSETS_INCLUDE_LOCALE;

                        charsets[i] = user_charsets[i];
                        i++;
                }
        }

        if (!(included & USER_CHARSETS_INCLUDE_UTF8))
                charsets[i++] = "UTF-8";

        if (!(included & USER_CHARSETS_INCLUDE_LOCALE))
                charsets[i++] = locale;

        charsets[i] = NULL;

        min = len;
        best = charsets[0];

        /* a single output buffer is reused for every conversion attempt */
        outleft = (len * 2) + 16;
        out = g_malloc (outleft + 1);

        for (i = 0; charsets[i]; i++) {
                if ((cd = g_mime_iconv_open ("UTF-8", charsets[i])) == (iconv_t) -1)
                        continue;

                outlen = charset_convert (cd, text, len, &out, &outleft, &ninval);

                g_mime_iconv_close (cd);

                /* a flawless conversion wins immediately */
                if (ninval == 0)
                        return g_realloc (out, outlen + 1);

                if (ninval < min) {
                        best = charsets[i];
                        min = ninval;
                }
        }

        /* if we get here, then none of the charsets fit the 8bit text flawlessly...
         * try to find the one that fit the best and use that to convert what we can,
         * replacing any byte we can't convert with a '?' */

        if ((cd = g_mime_iconv_open ("UTF-8", best)) == (iconv_t) -1) {
                /* this shouldn't happen... but if we are here, then
                 * it did...  the only thing we can do at this point
                 * is replace the 8bit garbage and pray */
                register const char *inptr = text;
                const char *inend = inptr + len;
                char *outbuf = out;

                while (inptr < inend) {
                        if (is_ascii (*inptr))
                                *outbuf++ = *inptr;
                        else
                                *outbuf++ = '?';

                        /* always consume the input byte; without this a
                         * single non-ASCII byte would loop forever and
                         * overrun the output buffer */
                        inptr++;
                }

                *outbuf++ = '\0';

                return g_realloc (out, (size_t) (outbuf - out));
        }

        outlen = charset_convert (cd, text, len, &out, &outleft, &ninval);

        g_mime_iconv_close (cd);

        return g_realloc (out, outlen + 1);
}
1663
1664
/*
 * quoted_decode:
 * @in: the encoded text
 * @len: number of bytes in @in
 * @out: output buffer; at most @len bytes are written to it
 *
 * Decodes rfc2047's version of quoted-printable: "=XY" becomes the
 * byte with hex value XY, '_' becomes a space and every other byte is
 * copied through. The two characters after '=' are not checked for
 * being hex digits.
 *
 * Returns: the number of bytes written to @out, or -1 if the input
 * ends in the middle of an "=XY" sequence.
 */
static ssize_t
quoted_decode (const unsigned char *in, size_t len, unsigned char *out)
{
        unsigned char *dst = out;
        unsigned char hi, lo;
        size_t i = 0;

        while (i < len) {
                unsigned char ch = in[i++];

                switch (ch) {
                case '=':
                        /* both hex digits must be present */
                        if (len - i < 2) {
                                /* data was truncated */
                                return -1;
                        }

                        hi = (unsigned char) toupper (in[i++]);
                        lo = (unsigned char) toupper (in[i++]);

                        hi = (unsigned char) ((hi >= 'A' ? hi - 'A' + 10 : hi - '0') & 0x0f);
                        lo = (unsigned char) ((lo >= 'A' ? lo - 'A' + 10 : lo - '0') & 0x0f);

                        *dst++ = (unsigned char) ((hi << 4) | lo);
                        break;
                case '_':
                        /* _'s are an rfc2047 shortcut for encoding spaces */
                        *dst++ = ' ';
                        break;
                default:
                        *dst++ = ch;
                        break;
                }
        }

        return (ssize_t) (dst - out);
}
1700
1701 #define is_rfc2047_encoded_word(atom, len) (len >= 7 && !strncmp (atom, "=?", 2) && !strncmp (atom + len - 2, "?=", 2))
1702
1703 static char *
1704 rfc2047_decode_word (const char *in, size_t inlen)
1705 {
1706         const unsigned char *instart = (const unsigned char *) in;
1707         const register unsigned char *inptr = instart + 2;
1708         const unsigned char *inend = instart + inlen - 2;
1709         unsigned char *decoded;
1710         const char *charset;
1711         size_t len, ninval;
1712         char *charenc, *p;
1713         guint32 save = 0;
1714         ssize_t declen;
1715         int state = 0;
1716         iconv_t cd;
1717         char *buf;
1718
1719         /* skip over the charset */
1720         if (!(inptr = memchr (inptr, '?', inend - inptr)) || inptr[2] != '?')
1721                 return NULL;
1722
1723         inptr++;
1724
1725         switch (*inptr) {
1726         case 'B':
1727         case 'b':
1728                 inptr += 2;
1729                 len = (size_t) (inend - inptr);
1730                 decoded = g_alloca (len);
1731                 declen = g_mime_encoding_base64_decode_step (inptr, len, decoded, &state, &save);
1732
1733                 if (declen == -1) {
1734                         d(fprintf (stderr, "encountered broken 'Q' encoding\n"));
1735                         return NULL;
1736                 }
1737                 break;
1738         case 'Q':
1739         case 'q':
1740                 inptr += 2;
1741                 len = (size_t) (inend - inptr);
1742                 decoded = g_alloca (len);
1743                 declen = quoted_decode (inptr, len, decoded);
1744
1745                 if (declen == -1) {
1746                         d(fprintf (stderr, "encountered broken 'Q' encoding\n"));
1747                         return NULL;
1748                 }
1749                 break;
1750         default:
1751                 d(fprintf (stderr, "unknown encoding\n"));
1752                 return NULL;
1753         }
1754
1755         len = (inptr - 3) - (instart + 2);
1756         charenc = g_alloca (len + 1);
1757         memcpy (charenc, in + 2, len);
1758         charenc[len] = '\0';
1759         charset = charenc;
1760
1761         /* rfc2231 updates rfc2047 encoded words...
1762          * The ABNF given in RFC 2047 for encoded-words is:
1763          *   encoded-word := "=?" charset "?" encoding "?" encoded-text "?="
1764          * This specification changes this ABNF to:
1765          *   encoded-word := "=?" charset ["*" language] "?" encoding "?" encoded-text "?="
1766          */
1767
1768         /* trim off the 'language' part if it's there... */
1769         if ((p = strchr (charset, '*')))
1770                 *p = '\0';
1771
1772         /* slight optimization? */
1773         if (!g_ascii_strcasecmp (charset, "UTF-8")) {
1774                 p = (char *) decoded;
1775                 len = declen;
1776
1777                 //while (!g_utf8_validate (p, len, (const char **) &p)) {
1778                 //      len = declen - (p - (char *) decoded);
1779                 //      *p = '?';
1780                 //}
1781
1782                 return g_strndup ((char *) decoded, declen);
1783         }
1784
1785         if (!charset[0] || (cd = g_mime_iconv_open ("UTF-8", charset)) == (iconv_t) -1) {
1786                 w(g_warning ("Cannot convert from %s to UTF-8, header display may "
1787                              "be corrupt: %s", charset[0] ? charset : "unspecified charset",
1788                              g_strerror (errno)));
1789
1790                 return g_mime_utils_decode_8bit ((char *) decoded, declen);
1791         }
1792
1793         len = declen;
1794         buf = g_malloc (len + 1);
1795
1796         charset_convert (cd, (char *) decoded, declen, &buf, &len, &ninval);
1797
1798         g_mime_iconv_close (cd);
1799
1800 #if w(!)0
1801         if (ninval > 0) {
1802                 g_warning ("Failed to completely convert \"%.*s\" to UTF-8, display may be "
1803                            "corrupt: %s", declen, decoded, g_strerror (errno));
1804         }
1805 #endif
1806
1807         return buf;
1808 }
1809
1810
1811 /**
1812  * g_mime_utils_header_decode_text:
1813  * @text: header text to decode
1814  *
1815  * Decodes an rfc2047 encoded 'text' header.
1816  *
1817  * Note: See g_mime_set_user_charsets() for details on how charset
1818  * conversion is handled for unencoded 8bit text and/or wrongly
1819  * specified rfc2047 encoded-word tokens.
1820  *
1821  * Returns: a newly allocated UTF-8 string representing the decoded
1822  * header.
1823  **/
1824 char *
1825 g_mime_utils_header_decode_text (const char *text)
1826 {
1827         gboolean enable_rfc2047_workarounds = _g_mime_enable_rfc2047_workarounds ();
1828         register const char *inptr = text;
1829         gboolean encoded = FALSE;
1830         const char *lwsp, *word;
1831         size_t nlwsp, n;
1832         gboolean ascii;
1833         char *decoded;
1834         GString *out;
1835
1836         if (text == NULL)
1837                 return g_strdup ("");
1838
1839         out = g_string_sized_new (strlen (text) + 1);
1840
1841         while (*inptr != '\0') {
1842                 lwsp = inptr;
1843                 while (is_lwsp (*inptr))
1844                         inptr++;
1845
1846                 nlwsp = (size_t) (inptr - lwsp);
1847
1848                 if (*inptr != '\0') {
1849                         word = inptr;
1850                         ascii = TRUE;
1851
1852                         if (enable_rfc2047_workarounds) {
1853                                 if (!strncmp (inptr, "=?", 2)) {
1854                                         inptr += 2;
1855
1856                                         /* skip past the charset (if one is even declared, sigh) */
1857                                         while (*inptr && *inptr != '?') {
1858                                                 ascii = ascii && is_ascii (*inptr);
1859                                                 inptr++;
1860                                         }
1861
1862                                         /* sanity check encoding type */
1863                                         if (inptr[0] != '?' || !strchr ("BbQq", inptr[1]) || inptr[2] != '?')
1864                                                 goto non_rfc2047;
1865
1866                                         inptr += 3;
1867
1868                                         /* find the end of the rfc2047 encoded word token */
1869                                         while (*inptr && strncmp (inptr, "?=", 2) != 0) {
1870                                                 ascii = ascii && is_ascii (*inptr);
1871                                                 inptr++;
1872                                         }
1873
1874                                         if (!strncmp (inptr, "?=", 2))
1875                                                 inptr += 2;
1876                                 } else {
1877                                 non_rfc2047:
1878                                         /* stop if we encounter a possible rfc2047 encoded
1879                                          * token even if it's inside another word, sigh. */
1880                                         while (*inptr && !is_lwsp (*inptr) &&
1881                                                strncmp (inptr, "=?", 2) != 0) {
1882                                                 ascii = ascii && is_ascii (*inptr);
1883                                                 inptr++;
1884                                         }
1885                                 }
1886                         } else {
1887                                 while (*inptr && !is_lwsp (*inptr)) {
1888                                         ascii = ascii && is_ascii (*inptr);
1889                                         inptr++;
1890                                 }
1891                         }
1892
1893                         n = (size_t) (inptr - word);
1894                         if (is_rfc2047_encoded_word (word, n)) {
1895                                 if ((decoded = rfc2047_decode_word (word, n))) {
1896                                         /* rfc2047 states that you must ignore all
1897                                          * whitespace between encoded words */
1898                                         if (!encoded)
1899                                                 g_string_append_len (out, lwsp, nlwsp);
1900
1901                                         g_string_append (out, decoded);
1902                                         g_free (decoded);
1903
1904                                         encoded = TRUE;
1905                                 } else {
1906                                         /* append lwsp and invalid rfc2047 encoded-word token */
1907                                         g_string_append_len (out, lwsp, nlwsp + n);
1908                                         encoded = FALSE;
1909                                 }
1910                         } else {
1911                                 /* append lwsp */
1912                                 g_string_append_len (out, lwsp, nlwsp);
1913
1914                                 /* append word token */
1915                                 if (!ascii) {
1916                                         /* *sigh* I hate broken mailers... */
1917                                         decoded = g_mime_utils_decode_8bit (word, n);
1918                                         g_string_append (out, decoded);
1919                                         g_free (decoded);
1920                                 } else {
1921                                         g_string_append_len (out, word, n);
1922                                 }
1923
1924                                 encoded = FALSE;
1925                         }
1926                 } else {
1927                         /* appending trailing lwsp */
1928                         g_string_append_len (out, lwsp, nlwsp);
1929                         break;
1930                 }
1931         }
1932
1933         decoded = out->str;
1934         g_string_free (out, FALSE);
1935
1936         return decoded;
1937 }
1938
1939
1940 /**
1941  * g_mime_utils_header_decode_phrase:
1942  * @phrase: header to decode
1943  *
1944  * Decodes an rfc2047 encoded 'phrase' header.
1945  *
1946  * Note: See g_mime_set_user_charsets() for details on how charset
1947  * conversion is handled for unencoded 8bit text and/or wrongly
1948  * specified rfc2047 encoded-word tokens.
1949  *
1950  * Returns: a newly allocated UTF-8 string representing the the decoded
1951  * header.
1952  **/
1953 char *
1954 g_mime_utils_header_decode_phrase (const char *phrase)
1955 {
1956         register const char *inptr = phrase;
1957         gboolean encoded = FALSE;
1958         const char *lwsp, *text;
1959         size_t nlwsp, n;
1960         gboolean ascii;
1961         char *decoded;
1962         GString *out;
1963
1964         if (phrase == NULL)
1965                 return g_strdup ("");
1966
1967         out = g_string_sized_new (strlen (phrase) + 1);
1968
1969         while (*inptr != '\0') {
1970                 lwsp = inptr;
1971                 while (is_lwsp (*inptr))
1972                         inptr++;
1973
1974                 nlwsp = (size_t) (inptr - lwsp);
1975
1976                 text = inptr;
1977                 if (is_atom (*inptr)) {
1978                         while (is_atom (*inptr))
1979                                 inptr++;
1980
1981                         n = (size_t) (inptr - text);
1982                         if (is_rfc2047_encoded_word (text, n)) {
1983                                 if ((decoded = rfc2047_decode_word (text, n))) {
1984                                         /* rfc2047 states that you must ignore all
1985                                          * whitespace between encoded words */
1986                                         if (!encoded)
1987                                                 g_string_append_len (out, lwsp, nlwsp);
1988
1989                                         g_string_append (out, decoded);
1990                                         g_free (decoded);
1991
1992                                         encoded = TRUE;
1993                                 } else {
1994                                         /* append lwsp and invalid rfc2047 encoded-word token */
1995                                         g_string_append_len (out, lwsp, nlwsp + n);
1996                                         encoded = FALSE;
1997                                 }
1998                         } else {
1999                                 /* append lwsp and atom token */
2000                                 g_string_append_len (out, lwsp, nlwsp + n);
2001                                 encoded = FALSE;
2002                         }
2003                 } else {
2004                         g_string_append_len (out, lwsp, nlwsp);
2005
2006                         ascii = TRUE;
2007                         while (*inptr && !is_lwsp (*inptr)) {
2008                                 ascii = ascii && is_ascii (*inptr);
2009                                 inptr++;
2010                         }
2011
2012                         n = (size_t) (inptr - text);
2013
2014                         if (!ascii) {
2015                                 /* *sigh* I hate broken mailers... */
2016                                 decoded = g_mime_utils_decode_8bit (text, n);
2017                                 g_string_append (out, decoded);
2018                                 g_free (decoded);
2019                         } else {
2020                                 g_string_append_len (out, text, n);
2021                         }
2022
2023                         encoded = FALSE;
2024                 }
2025         }
2026
2027         decoded = out->str;
2028         g_string_free (out, FALSE);
2029
2030         return decoded;
2031 }
2032
2033
2034 /* rfc2047 version of quoted-printable */
2035 static size_t
2036 quoted_encode (const char *in, size_t len, unsigned char *out, gushort safemask)
2037 {
2038         register const unsigned char *inptr = (const unsigned char *) in;
2039         const unsigned char *inend = inptr + len;
2040         register unsigned char *outptr = out;
2041         unsigned char c;
2042
2043         while (inptr < inend) {
2044                 c = *inptr++;
2045                 if (c == ' ') {
2046                         *outptr++ = '_';
2047                 } else if (c != '_' && gmime_special_table[c] & safemask) {
2048                         *outptr++ = c;
2049                 } else {
2050                         *outptr++ = '=';
2051                         *outptr++ = tohex[(c >> 4) & 0xf];
2052                         *outptr++ = tohex[c & 0xf];
2053                 }
2054         }
2055
2056         return (outptr - out);
2057 }
2058
2059 static void
2060 rfc2047_encode_word (GString *string, const char *word, size_t len,
2061                      const char *charset, gushort safemask)
2062 {
2063         register char *inptr, *outptr;
2064         iconv_t cd = (iconv_t) -1;
2065         unsigned char *encoded;
2066         size_t enclen, pos;
2067         char *uword = NULL;
2068         guint32 save = 0;
2069         int state = 0;
2070         char encoding;
2071
2072         if (g_ascii_strcasecmp (charset, "UTF-8") != 0)
2073                 cd = g_mime_iconv_open (charset, "UTF-8");
2074
2075         if (cd != (iconv_t) -1) {
2076                 uword = g_mime_iconv_strndup (cd, (char *) word, len);
2077                 g_mime_iconv_close (cd);
2078         }
2079
2080         if (uword) {
2081                 len = strlen (uword);
2082                 word = uword;
2083         } else {
2084                 charset = "UTF-8";
2085         }
2086
2087         switch (g_mime_utils_best_encoding ((const unsigned char *) word, len)) {
2088         case GMIME_CONTENT_ENCODING_BASE64:
2089                 enclen = GMIME_BASE64_ENCODE_LEN (len);
2090                 encoded = g_alloca (enclen + 1);
2091
2092                 encoding = 'b';
2093
2094                 pos = g_mime_encoding_base64_encode_close ((const unsigned char *) word, len, encoded, &state, &save);
2095                 encoded[pos] = '\0';
2096
2097                 /* remove \n chars as headers need to be wrapped differently */
2098                 if (G_UNLIKELY ((inptr = strchr ((char *) encoded, '\n')))) {
2099                         outptr = inptr++;
2100                         while (G_LIKELY (*inptr)) {
2101                                 if (G_LIKELY (*inptr != '\n'))
2102                                         *outptr++ = *inptr;
2103
2104                                 inptr++;
2105                         }
2106
2107                         *outptr = '\0';
2108                 }
2109
2110                 break;
2111         case GMIME_CONTENT_ENCODING_QUOTEDPRINTABLE:
2112                 enclen = GMIME_QP_ENCODE_LEN (len);
2113                 encoded = g_alloca (enclen + 1);
2114
2115                 encoding = 'q';
2116
2117                 pos = quoted_encode (word, len, encoded, safemask);
2118                 encoded[pos] = '\0';
2119
2120                 break;
2121         default:
2122                 encoded = NULL;
2123                 encoding = '\0';
2124                 g_assert_not_reached ();
2125         }
2126
2127         g_free (uword);
2128
2129         g_string_append_printf (string, "=?%s?%c?%s?=", charset, encoding, encoded);
2130 }
2131
2132
/* Kind of output a word token needs. The values are ordered so that
 * MAX() of two kinds yields the more demanding one. */
typedef enum {
        WORD_ATOM,      /* can be written out verbatim */
        WORD_QSTRING,   /* must be written as a quoted-string */
        WORD_2047       /* must be written as an rfc2047 encoded-word */
} rfc822_word_t;

/* One token of the header being encoded, as a node of a singly linked
 * list. The token text is the half-open byte range [start, end) of the
 * input string; nothing is copied. */
typedef struct _rfc822_word {
        struct _rfc822_word *next;
        const char *start;
        const char *end;
        rfc822_word_t type;
        int encoding;           /* 0 = us-ascii, 1 = iso-8859-1, 2 = anything else */
} rfc822_word;

#define rfc822_word_new() g_slice_new (rfc822_word)
#define rfc822_word_free(word) g_slice_free (rfc822_word, word)
2148
2149 /* okay, so 'unstructured text' fields don't actually contain 'word'
2150  * tokens, but we can group stuff similarly... */
2151 static rfc822_word *
2152 rfc2047_encode_get_rfc822_words (const char *in, gboolean phrase)
2153 {
2154         rfc822_word *words, *tail, *word;
2155         rfc822_word_t type = WORD_ATOM;
2156         const char *inptr, *start, *last;
2157         int count = 0, encoding = 0;
2158
2159         words = NULL;
2160         tail = (rfc822_word *) &words;
2161
2162         last = start = inptr = in;
2163         while (inptr && *inptr) {
2164                 const char *newinptr;
2165                 gunichar c;
2166
2167                 newinptr = g_utf8_next_char (inptr);
2168                 c = g_utf8_get_char (inptr);
2169                 if (newinptr == NULL || !g_unichar_validate (c)) {
2170                         w(g_warning ("Invalid UTF-8 sequence encountered"));
2171                         inptr++;
2172                         continue;
2173                 }
2174
2175                 inptr = newinptr;
2176
2177                 if (c < 256 && is_lwsp (c)) {
2178                         if (count > 0) {
2179                                 word = rfc822_word_new ();
2180                                 word->next = NULL;
2181                                 word->start = start;
2182                                 word->end = last;
2183                                 word->type = type;
2184                                 word->encoding = encoding;
2185
2186                                 tail->next = word;
2187                                 tail = word;
2188                                 count = 0;
2189                         }
2190
2191                         start = inptr;
2192                         type = WORD_ATOM;
2193                         encoding = 0;
2194                 } else {
2195                         count++;
2196                         if (phrase && c < 128) {
2197                                 /* phrases can have qstring words */
2198                                 if (!is_atom (c))
2199                                         type = MAX (type, WORD_QSTRING);
2200                         } else if (c > 127 && c < 256) {
2201                                 type = WORD_2047;
2202                                 encoding = MAX (encoding, 1);
2203                         } else if (c >= 256) {
2204                                 type = WORD_2047;
2205                                 encoding = 2;
2206                         }
2207
2208                         if (count >= GMIME_FOLD_PREENCODED) {
2209                                 word = rfc822_word_new ();
2210                                 word->next = NULL;
2211                                 word->start = start;
2212                                 word->end = inptr;
2213                                 word->type = type;
2214                                 word->encoding = encoding;
2215
2216                                 tail->next = word;
2217                                 tail = word;
2218                                 count = 0;
2219
2220                                 /* Note: don't reset 'type' as it
2221                                  * needs to be preserved when breaking
2222                                  * long words */
2223                                 start = inptr;
2224                                 encoding = 0;
2225                         }
2226                 }
2227
2228                 last = inptr;
2229         }
2230
2231         if (count > 0) {
2232                 word = rfc822_word_new ();
2233                 word->next = NULL;
2234                 word->start = start;
2235                 word->end = last;
2236                 word->type = type;
2237                 word->encoding = encoding;
2238
2239                 tail->next = word;
2240                 tail = word;
2241         }
2242
2243 #if d(!)0
2244         printf ("rfc822 word tokens:\n");
2245         word = words;
2246         while (word) {
2247                 printf ("\t'%.*s'; type=%d, encoding=%d\n",
2248                         word->end - word->start, word->start,
2249                         word->type, word->encoding);
2250
2251                 word = word->next;
2252         }
2253 #endif
2254
2255         return words;
2256 }
2257
2258 #define MERGED_WORD_LT_FOLDLEN(wlen, type) ((type) == WORD_2047 ? (wlen) < GMIME_FOLD_PREENCODED : (wlen) < (GMIME_FOLD_LEN - 8))
2259
2260 static gboolean
2261 should_merge_words (rfc822_word *word, rfc822_word *next)
2262 {
2263         switch (word->type) {
2264         case WORD_ATOM:
2265                 if (next->type == WORD_2047)
2266                         return FALSE;
2267
2268                 return (MERGED_WORD_LT_FOLDLEN (next->end - word->start, next->type));
2269         case WORD_QSTRING:
2270                 /* avoid merging with words that need to be rfc2047 encoded */
2271                 if (next->type == WORD_2047)
2272                         return FALSE;
2273
2274                 return (MERGED_WORD_LT_FOLDLEN (next->end - word->start, WORD_QSTRING));
2275         case WORD_2047:
2276                 if (next->type == WORD_ATOM) {
2277                         /* whether we merge or not is dependent upon:
2278                          * 1. the number of atoms in a row after 'word'
2279                          * 2. if there is another encword after the string of atoms.
2280                          */
2281                         int natoms = 0;
2282
2283                         while (next && next->type == WORD_ATOM) {
2284                                 next = next->next;
2285                                 natoms++;
2286                         }
2287
2288                         /* if all the words after the encword are atoms, don't merge */
2289                         if (!next || natoms > 3)
2290                                 return FALSE;
2291                 }
2292
2293                 /* avoid merging with qstrings */
2294                 if (next->type == WORD_QSTRING)
2295                         return FALSE;
2296
2297                 return (MERGED_WORD_LT_FOLDLEN (next->end - word->start, WORD_2047));
2298         default:
2299                 return FALSE;
2300         }
2301 }
2302
2303 static void
2304 rfc2047_encode_merge_rfc822_words (rfc822_word **wordsp)
2305 {
2306         rfc822_word *word, *next, *words = *wordsp;
2307
2308         /* first pass: merge qstrings with adjacent qstrings and encwords with adjacent encwords */
2309         word = words;
2310         while (word && word->next) {
2311                 next = word->next;
2312
2313                 if (word->type != WORD_ATOM && word->type == next->type &&
2314                     MERGED_WORD_LT_FOLDLEN (next->end - word->start, word->type)) {
2315                         /* merge the words */
2316                         word->encoding = MAX (word->encoding, next->encoding);
2317
2318                         word->end = next->end;
2319                         word->next = next->next;
2320
2321                         rfc822_word_free (next);
2322
2323                         next = word;
2324                 }
2325
2326                 word = next;
2327         }
2328
2329         /* second pass: now merge atoms with the other words */
2330         word = words;
2331         while (word && word->next) {
2332                 next = word->next;
2333
2334                 if (should_merge_words (word, next)) {
2335                         /* the resulting word type is the MAX of the 2 types */
2336                         word->type = MAX (word->type, next->type);
2337
2338                         word->encoding = MAX (word->encoding, next->encoding);
2339
2340                         word->end = next->end;
2341                         word->next = next->next;
2342
2343                         rfc822_word_free (next);
2344
2345                         continue;
2346                 }
2347
2348                 word = next;
2349         }
2350
2351         *wordsp = words;
2352 }
2353
2354 static void
2355 g_string_append_len_quoted (GString *out, const char *in, size_t len)
2356 {
2357         register const char *inptr;
2358         const char *inend;
2359
2360         g_string_append_c (out, '"');
2361
2362         inptr = in;
2363         inend = in + len;
2364
2365         while (inptr < inend) {
2366                 if (*inptr == '"' || *inptr == '\\')
2367                         g_string_append_c (out, '\\');
2368
2369                 g_string_append_c (out, *inptr);
2370
2371                 inptr++;
2372         }
2373
2374         g_string_append_c (out, '"');
2375 }
2376
2377 static char *
2378 rfc2047_encode (const char *in, gushort safemask)
2379 {
2380         rfc822_word *words, *word, *prev = NULL;
2381         const char **charsets, *charset;
2382         const char *start;
2383         GMimeCharset mask;
2384         GString *out;
2385         char *outstr;
2386         size_t len;
2387         int i;
2388
2389         if (!(words = rfc2047_encode_get_rfc822_words (in, safemask & IS_PSAFE)))
2390                 return g_strdup (in);
2391
2392         rfc2047_encode_merge_rfc822_words (&words);
2393
2394         charsets = g_mime_user_charsets ();
2395
2396         out = g_string_new ("");
2397
2398         /* output words now with spaces between them */
2399         word = words;
2400         while (word) {
2401                 /* append correct number of spaces between words */
2402                 if (prev && !(prev->type == WORD_2047 && word->type == WORD_2047)) {
2403                         /* one or both of the words are not encoded so we write the spaces out untouched */
2404                         len = word->start - prev->end;
2405                         g_string_append_len (out, prev->end, len);
2406                 }
2407
2408                 switch (word->type) {
2409                 case WORD_ATOM:
2410                         g_string_append_len (out, word->start, (size_t) (word->end - word->start));
2411                         break;
2412                 case WORD_QSTRING:
2413                         g_assert (safemask & IS_PSAFE);
2414                         g_string_append_len_quoted (out, word->start, (size_t) (word->end - word->start));
2415                         break;
2416                 case WORD_2047:
2417                         if (prev && prev->type == WORD_2047) {
2418                                 /* include the whitespace chars between these 2 words in the
2419                                    resulting rfc2047 encoded word. */
2420                                 len = word->end - prev->end;
2421                                 start = prev->end;
2422
2423                                 /* encoded words need to be separated by linear whitespace */
2424                                 g_string_append_c (out, ' ');
2425                         } else {
2426                                 len = word->end - word->start;
2427                                 start = word->start;
2428                         }
2429
2430                         switch (word->encoding) {
2431                         case 0: /* us-ascii */
2432                                 rfc2047_encode_word (out, start, len, "us-ascii", safemask);
2433                                 break;
2434                         case 1: /* iso-8859-1 */
2435                                 rfc2047_encode_word (out, start, len, "iso-8859-1", safemask);
2436                                 break;
2437                         default:
2438                                 charset = NULL;
2439                                 g_mime_charset_init (&mask);
2440                                 g_mime_charset_step (&mask, start, len);
2441
2442                                 for (i = 0; charsets && charsets[i]; i++) {
2443                                         if (g_mime_charset_can_encode (&mask, charsets[i], start, len)) {
2444                                                 charset = charsets[i];
2445                                                 break;
2446                                         }
2447                                 }
2448
2449                                 if (!charset)
2450                                         charset = g_mime_charset_best_name (&mask);
2451
2452                                 rfc2047_encode_word (out, start, len, charset, safemask);
2453                                 break;
2454                         }
2455
2456                         break;
2457                 }
2458
2459                 rfc822_word_free (prev);
2460
2461                 prev = word;
2462                 word = word->next;
2463         }
2464
2465         rfc822_word_free (prev);
2466
2467         outstr = out->str;
2468         g_string_free (out, FALSE);
2469
2470         return outstr;
2471 }
2472
2473
2474 /**
2475  * g_mime_utils_header_encode_phrase:
2476  * @phrase: phrase to encode
2477  *
2478  * Encodes a 'phrase' header according to the rules in rfc2047.
2479  *
2480  * Returns: the encoded 'phrase'. Useful for encoding internet
2481  * addresses.
2482  **/
2483 char *
2484 g_mime_utils_header_encode_phrase (const char *phrase)
2485 {
2486         if (phrase == NULL)
2487                 return NULL;
2488
2489         return rfc2047_encode (phrase, IS_PSAFE);
2490 }
2491
2492
2493 /**
2494  * g_mime_utils_header_encode_text:
2495  * @text: text to encode
2496  *
2497  * Encodes a 'text' header according to the rules in rfc2047.
2498  *
2499  * Returns: the encoded header. Useful for encoding
2500  * headers like "Subject".
2501  **/
2502 char *
2503 g_mime_utils_header_encode_text (const char *text)
2504 {
2505         if (text == NULL)
2506                 return NULL;
2507
2508         return rfc2047_encode (text, IS_ESAFE);
2509 }