git.notmuchmail.org Git - notmuch/blob - date.c

   1 /* date.c - Date-parsing utility for the notmuch mail system.
   2  *
   3  *  Copyright © 2000-2009 Jeffrey Stedfast
   4  *
   5  * This program is free software: you can redistribute it and/or modify
   6  * it under the terms of the GNU General Public License as published by
   7  * the Free Software Foundation, either version 3 of the License, or
   8  * (at your option) any later version.
   9  *
  10  * This program is distributed in the hope that it will be useful,
  11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13  * GNU General Public License for more details.
  14  *
  15  * You should have received a copy of the GNU General Public License
  16  * along with this program.  If not, see http://www.gnu.org/licenses/
  17  */
  18
   19 /* This code was originally written by Jeffrey Stedfast
  20  * as part of his GMime library (http://spruce.sourceforge.net/gmime/)
  21  *
  22  * Carl Worth <cworth@cworth.org> imported it into notmuch and removed
  23  * some glib-isms.
  24  */
  25
  26 #ifdef HAVE_CONFIG_H
  27 #include <config.h>
  28 #endif
  29
  30 #define _GNU_SOURCE
  31
  32 #include <glib.h>
  33
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
  37 #ifdef HAVE_SYS_PARAM_H
  38 #include <sys/param.h>      /* for MAXHOSTNAMELEN */
  39 #else
  40 #define MAXHOSTNAMELEN 64
  41 #endif
  42 #ifdef HAVE_UTSNAME_DOMAINNAME
  43 #include <sys/utsname.h>    /* for uname() */
  44 #endif
  45 #include <sys/types.h>
  46 #ifdef HAVE_UNISTD_H
  47 #include <unistd.h>         /* Unix header for getpid() */
  48 #endif
  49 #ifdef G_OS_WIN32
  50 #include <winsock2.h>
  51 #include <ws2tcpip.h>
  52 #include <process.h>
  53 #define getpid() _getpid()
  54 #endif
  55 #ifdef HAVE_NETDB_H
  56 #include <netdb.h>
  57 #endif
  58 #include <ctype.h>
  59 #include <errno.h>
  60
  61 #include "gmime-utils.h"
  62 #include "gmime-table-private.h"
  63 #include "gmime-parse-utils.h"
  64 #include "gmime-part.h"
  65 #include "gmime-charset.h"
  66 #include "gmime-iconv.h"
  67 #include "gmime-iconv-utils.h"
  68
  69 #ifdef ENABLE_WARNINGS
  70 #define w(x) x
  71 #else
  72 #define w(x)
  73 #endif /* ENABLE_WARNINGS */
  74
  75 #define d(x)
  76
  77
  78 /**
  79  * SECTION: gmime-utils
  80  * @title: gmime-utils
  81  * @short_description: MIME utility functions
  82  * @see_also:
  83  *
  84  * Utility functions to parse, encode and decode various MIME tokens
  85  * and encodings.
  86  **/
  87
  88 extern gboolean _g_mime_enable_rfc2047_workarounds (void);
  89
  90 #define GMIME_FOLD_PREENCODED  (GMIME_FOLD_LEN / 2)
  91
/* date parser macros */

/* Character sets for each class of date token.  The disabled
 * gmime_datetok_table_init() generator tests every byte value against
 * these strings to build gmime_datetok_table[]. */
#define NUMERIC_CHARS          "1234567890"
#define WEEKDAY_CHARS          "SundayMondayTuesdayWednesdayThursdayFridaySaturday"
#define MONTH_CHARS            "JanuaryFebruaryMarchAprilMayJuneJulyAugustSeptemberOctoberNovemberDecember"
#define TIMEZONE_ALPHA_CHARS   "UTCGMTESTEDTCSTCDTMSTPSTPDTZAMNY()"
#define TIMEZONE_NUMERIC_CHARS "-+1234567890"
#define TIME_CHARS             "1234567890:"

/* Flag bits stored per character in gmime_datetok_table[] and OR-ed
 * together per token by datetok().  A NON_* bit means "at least one
 * character cannot belong to that class"; the HAS_* bits record that a
 * ':' or a '+'/'-' was seen. */
#define DATE_TOKEN_NON_NUMERIC          (1 << 0)
#define DATE_TOKEN_NON_WEEKDAY          (1 << 1)
#define DATE_TOKEN_NON_MONTH            (1 << 2)
#define DATE_TOKEN_NON_TIME             (1 << 3)
#define DATE_TOKEN_HAS_COLON            (1 << 4)
#define DATE_TOKEN_NON_TIMEZONE_ALPHA   (1 << 5)
#define DATE_TOKEN_NON_TIMEZONE_NUMERIC (1 << 6)
#define DATE_TOKEN_HAS_SIGN             (1 << 7)
 108
 109 static unsigned char tohex[16] = {
 110         '0', '1', '2', '3', '4', '5', '6', '7',
 111         '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'
 112 };
 113
/*
 * Per-character classification table used by datetok(), indexed by the
 * character's value as an unsigned char.  Each entry is a bitwise OR of
 * DATE_TOKEN_* flags: a DATE_TOKEN_NON_* bit is set when the character
 * cannot appear in that class of token, and DATE_TOKEN_HAS_COLON /
 * DATE_TOKEN_HAS_SIGN mark ':' and '+' / '-'.  Entry 0 also carries
 * DATE_TOKEN_HAS_SIGN, but datetok() never looks up the NUL terminator.
 *
 * The declaration has the same layout that the disabled
 * gmime_datetok_table_init() generator prints.
 */
static unsigned char gmime_datetok_table[256] = {
        128,111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,
        111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,
        111,111,111,111,111,111,111,111, 79, 79,111,175,111,175,111,111,
         38, 38, 38, 38, 38, 38, 38, 38, 38, 38,119,111,111,111,111,111,
        111, 75,111, 79, 75, 79,105, 79,111,111,107,111,111, 73, 75,107,
         79,111,111, 73, 77, 79,111,109,111, 79, 79,111,111,111,111,111,
        111,105,107,107,109,105,111,107,105,105,111,111,107,107,105,105,
        107,111,105,105,105,105,107,111,111,105,111,111,111,111,111,111,
        111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,
        111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,
        111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,
        111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,
        111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,
        111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,
        111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,
        111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,
};
 132
 133 /* hrm, is there a library for this shit? */
 134 static struct {
 135         char *name;
 136         int offset;
 137 } tz_offsets [] = {
 138         { "UT", 0 },
 139         { "GMT", 0 },
 140         { "EST", -500 },        /* these are all US timezones.  bloody yanks */
 141         { "EDT", -400 },
 142         { "CST", -600 },
 143         { "CDT", -500 },
 144         { "MST", -700 },
 145         { "MDT", -600 },
 146         { "PST", -800 },
 147         { "PDT", -700 },
 148         { "Z", 0 },
 149         { "A", -100 },
 150         { "M", -1200 },
 151         { "N", 100 },
 152         { "Y", 1200 },
 153 };
 154
 155 static char *tm_months[] = {
 156         "Jan", "Feb", "Mar", "Apr", "May", "Jun",
 157         "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"
 158 };
 159
 160 static char *tm_days[] = {
 161         "Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"
 162 };
 163
 164
 165 /**
 166  * g_mime_utils_header_format_date:
 167  * @date: time_t date representation
 168  * @tz_offset: Timezone offset
 169  *
 170  * Allocates a string buffer containing the rfc822 formatted date
 171  * string represented by @time and @tz_offset.
 172  *
 173  * Returns: a valid string representation of the date.
 174  **/
 175 char *
 176 g_mime_utils_header_format_date (time_t date, int tz_offset)
 177 {
 178         struct tm tm;
 179
 180         date += ((tz_offset / 100) * (60 * 60)) + (tz_offset % 100) * 60;
 181
 182 #if defined (HAVE_GMTIME_R)
 183         gmtime_r (&date, &tm);
 184 #elif defined (HAVE_GMTIME_S)
 185         gmtime_s (&tm, &date);
 186 #else
 187         memcpy (&tm, gmtime (&date), sizeof (tm));
 188 #endif
 189
 190         return g_strdup_printf ("%s, %02d %s %04d %02d:%02d:%02d %+05d",
 191                                 tm_days[tm.tm_wday], tm.tm_mday,
 192                                 tm_months[tm.tm_mon],
 193                                 tm.tm_year + 1900,
 194                                 tm.tm_hour, tm.tm_min, tm.tm_sec,
 195                                 tz_offset);
 196 }
 197
 198 /* This is where it gets ugly... */
 199
/*
 * One piece of a date string as split out by datetok().  Tokens form a
 * singly linked list and point into the string being parsed; they do
 * not hold a copy of their text.
 */
typedef struct _date_token {
        struct _date_token *next;  /* following token, NULL for the last one */
        unsigned char mask;        /* OR of gmime_datetok_table[] over the token's characters */
        const char *start;         /* first character of the token (not NUL-terminated) */
        size_t len;                /* number of characters in the token */
} date_token;

/* Tokens are allocated and released with g_slice_new()/g_slice_free(). */
#define date_token_free(tok) g_slice_free (date_token, tok)
#define date_token_new() g_slice_new (date_token)
 209
 210 static date_token *
 211 datetok (const char *date)
 212 {
 213         date_token *tokens = NULL, *token, *tail = (date_token *) &tokens;
 214         const char *start, *end;
 215         unsigned char mask;
 216
 217         start = date;
 218         while (*start) {
 219                 /* kill leading whitespace */
 220                 while (*start == ' ' || *start == '\t')
 221                         start++;
 222
 223                 if (*start == '\0')
 224                         break;
 225
 226                 mask = gmime_datetok_table[(unsigned char) *start];
 227
 228                 /* find the end of this token */
 229                 end = start + 1;
 230                 while (*end && !strchr ("-/,\t\r\n ", *end))
 231                         mask |= gmime_datetok_table[(unsigned char) *end++];
 232
 233                 if (end != start) {
 234                         token = date_token_new ();
 235                         token->next = NULL;
 236                         token->start = start;
 237                         token->len = end - start;
 238                         token->mask = mask;
 239
 240                         tail->next = token;
 241                         tail = token;
 242                 }
 243
 244                 if (*end)
 245                         start = end + 1;
 246                 else
 247                         break;
 248         }
 249
 250         return tokens;
 251 }
 252
 253 static int
 254 decode_int (const char *in, size_t inlen)
 255 {
 256         register const char *inptr;
 257         int sign = 1, val = 0;
 258         const char *inend;
 259
 260         inptr = in;
 261         inend = in + inlen;
 262
 263         if (*inptr == '-') {
 264                 sign = -1;
 265                 inptr++;
 266         } else if (*inptr == '+')
 267                 inptr++;
 268
 269         for ( ; inptr < inend; inptr++) {
 270                 if (!(*inptr >= '0' && *inptr <= '9'))
 271                         return -1;
 272                 else
 273                         val = (val * 10) + (*inptr - '0');
 274         }
 275
 276         val *= sign;
 277
 278         return val;
 279 }
 280
#if 0
/*
 * Disabled (compiled out by the surrounding #if 0).
 *
 * Returns the number of days in @month of @year, where @month is
 * 1-based (1 = January ... 12 = December).  February is 29 days when
 * g_date_is_leap_year (@year) is true and 28 otherwise.  Any other
 * @month value yields 0.
 */
static int
get_days_in_month (int month, int year)
{
        switch (month) {
        case 1:
        case 3:
        case 5:
        case 7:
        case 8:
        case 10:
        case 12:
                return 31;
        case 4:
        case 6:
        case 9:
        case 11:
                return 30;
        case 2:
                if (g_date_is_leap_year (year))
                        return 29;
                else
                        return 28;
        default:
                return 0;
        }
}
#endif
 309
 310 static int
 311 get_wday (const char *in, size_t inlen)
 312 {
 313         int wday;
 314
 315         g_return_val_if_fail (in != NULL, -1);
 316
 317         if (inlen < 3)
 318                 return -1;
 319
 320         for (wday = 0; wday < 7; wday++) {
 321                 if (!g_ascii_strncasecmp (in, tm_days[wday], 3))
 322                         return wday;
 323         }
 324
 325         return -1;  /* unknown week day */
 326 }
 327
 328 static int
 329 get_mday (const char *in, size_t inlen)
 330 {
 331         int mday;
 332
 333         g_return_val_if_fail (in != NULL, -1);
 334
 335         mday = decode_int (in, inlen);
 336
 337         if (mday < 0 || mday > 31)
 338                 mday = -1;
 339
 340         return mday;
 341 }
 342
 343 static int
 344 get_month (const char *in, size_t inlen)
 345 {
 346         int i;
 347
 348         g_return_val_if_fail (in != NULL, -1);
 349
 350         if (inlen < 3)
 351                 return -1;
 352
 353         for (i = 0; i < 12; i++) {
 354                 if (!g_ascii_strncasecmp (in, tm_months[i], 3))
 355                         return i;
 356         }
 357
 358         return -1;  /* unknown month */
 359 }
 360
 361 static int
 362 get_year (const char *in, size_t inlen)
 363 {
 364         int year;
 365
 366         g_return_val_if_fail (in != NULL, -1);
 367
 368         if ((year = decode_int (in, inlen)) == -1)
 369                 return -1;
 370
 371         if (year < 100)
 372                 year += (year < 70) ? 2000 : 1900;
 373
 374         if (year < 1969)
 375                 return -1;
 376
 377         return year;
 378 }
 379
 380 static gboolean
 381 get_time (const char *in, size_t inlen, int *hour, int *min, int *sec)
 382 {
 383         register const char *inptr;
 384         int *val, colons = 0;
 385         const char *inend;
 386
 387         *hour = *min = *sec = 0;
 388
 389         inend = in + inlen;
 390         val = hour;
 391         for (inptr = in; inptr < inend; inptr++) {
 392                 if (*inptr == ':') {
 393                         colons++;
 394                         switch (colons) {
 395                         case 1:
 396                                 val = min;
 397                                 break;
 398                         case 2:
 399                                 val = sec;
 400                                 break;
 401                         default:
 402                                 return FALSE;
 403                         }
 404                 } else if (!(*inptr >= '0' && *inptr <= '9'))
 405                         return FALSE;
 406                 else
 407                         *val = (*val * 10) + (*inptr - '0');
 408         }
 409
 410         return TRUE;
 411 }
 412
 413 static int
 414 get_tzone (date_token **token)
 415 {
 416         const char *inptr, *inend;
 417         size_t inlen;
 418         int i, t;
 419
 420         for (i = 0; *token && i < 2; *token = (*token)->next, i++) {
 421                 inptr = (*token)->start;
 422                 inlen = (*token)->len;
 423                 inend = inptr + inlen;
 424
 425                 if (*inptr == '+' || *inptr == '-') {
 426                         return decode_int (inptr, inlen);
 427                 } else {
 428                         if (*inptr == '(') {
 429                                 inptr++;
 430                                 if (*(inend - 1) == ')')
 431                                         inlen -= 2;
 432                                 else
 433                                         inlen--;
 434                         }
 435
 436                         for (t = 0; t < 15; t++) {
 437                                 size_t len = strlen (tz_offsets[t].name);
 438
 439                                 if (len != inlen)
 440                                         continue;
 441
 442                                 if (!strncmp (inptr, tz_offsets[t].name, len))
 443                                         return tz_offsets[t].offset;
 444                         }
 445                 }
 446         }
 447
 448         return -1;
 449 }
 450
/*
 * mktime_utc:
 * @tm: broken-down time; modified by the call
 *
 * Converts @tm to a time_t by calling mktime() with tm_isdst set to -1
 * and then subtracting a zone correction, tz, chosen by whichever of
 * the platform branches below is compiled in.  The fields of @tm that
 * mktime() fills in (tm_isdst, and tm_gmtoff where available) are read
 * back to pick that correction.
 *
 * NOTE(review): the intent appears to be to treat @tm as a UTC time
 * and undo mktime()'s local-time interpretation; confirm the sign
 * conventions against the platform documentation.
 *
 * Returns: mktime (@tm) minus the zone correction.
 */
static time_t
mktime_utc (struct tm *tm)
{
        time_t tt;
        long tz;

        /* let mktime() decide whether daylight saving time applies */
        tm->tm_isdst = -1;
        tt = mktime (tm);

#if defined (G_OS_WIN32)
        _get_timezone (&tz);
        if (tm->tm_isdst > 0) {
                int dst;

                /* daylight saving in effect: add the DST bias as well */
                _get_dstbias (&dst);
                tz += dst;
        }
#elif defined (HAVE_TM_GMTOFF)
        tz = -tm->tm_gmtoff;
#elif defined (HAVE_TIMEZONE)
        if (tm->tm_isdst > 0) {
#if defined (HAVE_ALTZONE)
                tz = altzone;
#else /* !defined (HAVE_ALTZONE) */
                /* no altzone: use timezone shifted by one hour (3600 seconds) */
                tz = (timezone - 3600);
#endif
        } else {
                tz = timezone;
        }
#elif defined (HAVE__TIMEZONE)
        tz = _timezone;
#else
#error Neither HAVE_TIMEZONE nor HAVE_TM_GMTOFF defined. Rerun autoheader, autoconf, etc.
#endif

        return tt - tz;
}
 488
 489 static time_t
 490 parse_rfc822_date (date_token *tokens, int *tzone)
 491 {
 492         int hour, min, sec, offset, n;
 493         date_token *token;
 494         struct tm tm;
 495         time_t t;
 496
 497         g_return_val_if_fail (tokens != NULL, (time_t) 0);
 498
 499         token = tokens;
 500
 501         memset ((void *) &tm, 0, sizeof (struct tm));
 502
 503         if ((n = get_wday (token->start, token->len)) != -1) {
 504                 /* not all dates may have this... */
 505                 tm.tm_wday = n;
 506                 token = token->next;
 507         }
 508
 509         /* get the mday */
 510         if (!token || (n = get_mday (token->start, token->len)) == -1)
 511                 return (time_t) 0;
 512
 513         tm.tm_mday = n;
 514         token = token->next;
 515
 516         /* get the month */
 517         if (!token || (n = get_month (token->start, token->len)) == -1)
 518                 return (time_t) 0;
 519
 520         tm.tm_mon = n;
 521         token = token->next;
 522
 523         /* get the year */
 524         if (!token || (n = get_year (token->start, token->len)) == -1)
 525                 return (time_t) 0;
 526
 527         tm.tm_year = n - 1900;
 528         token = token->next;
 529
 530         /* get the hour/min/sec */
 531         if (!token || !get_time (token->start, token->len, &hour, &min, &sec))
 532                 return (time_t) 0;
 533
 534         tm.tm_hour = hour;
 535         tm.tm_min = min;
 536         tm.tm_sec = sec;
 537         token = token->next;
 538
 539         /* get the timezone */
 540         if (!token || (n = get_tzone (&token)) == -1) {
 541                 /* I guess we assume tz is GMT? */
 542                 offset = 0;
 543         } else {
 544                 offset = n;
 545         }
 546
 547         t = mktime_utc (&tm);
 548
 549         /* t is now GMT of the time we want, but not offset by the timezone ... */
 550
 551         /* this should convert the time to the GMT equiv time */
 552         t -= ((offset / 100) * 60 * 60) + (offset % 100) * 60;
 553
 554         if (tzone)
 555                 *tzone = offset;
 556
 557         return t;
 558 }
 559
 560
 561 #define date_token_mask(t)  (((date_token *) t)->mask)
 562 #define is_numeric(t)       ((date_token_mask (t) & DATE_TOKEN_NON_NUMERIC) == 0)
 563 #define is_weekday(t)       ((date_token_mask (t) & DATE_TOKEN_NON_WEEKDAY) == 0)
 564 #define is_month(t)         ((date_token_mask (t) & DATE_TOKEN_NON_MONTH) == 0)
 565 #define is_time(t)          (((date_token_mask (t) & DATE_TOKEN_NON_TIME) == 0) && (date_token_mask (t) & DATE_TOKEN_HAS_COLON))
 566 #define is_tzone_alpha(t)   ((date_token_mask (t) & DATE_TOKEN_NON_TIMEZONE_ALPHA) == 0)
 567 #define is_tzone_numeric(t) (((date_token_mask (t) & DATE_TOKEN_NON_TIMEZONE_NUMERIC) == 0) && (date_token_mask (t) & DATE_TOKEN_HAS_SIGN))
 568 #define is_tzone(t)         (is_tzone_alpha (t) || is_tzone_numeric (t))
 569
/*
 * parse_broken_date:
 * @tokens: token list from datetok()
 * @tzone: if non-NULL, receives the zone offset in +/-HHMM form
 *
 * Lenient parser, used by g_mime_utils_header_decode_date() when
 * parse_rfc822_date() returns 0.  The tokens may come in any order:
 * each one is classified by its character mask and by which fields
 * have not been filled in yet, in this order of preference: weekday,
 * month name, time, timezone, then plain numbers (year, month, day).
 * Tokens that fit nowhere are skipped.
 *
 * There is no failure return: whatever fields were recognised are
 * passed to mktime_utc(), the rest stay zero from the memset().
 *
 * Returns: mktime_utc() of the collected fields minus the zone offset.
 */
static time_t
parse_broken_date (date_token *tokens, int *tzone)
{
        gboolean got_wday, got_month, got_tzone;
        int hour, min, sec, offset, n;
        date_token *token;
        struct tm tm;
        time_t t;

        memset ((void *) &tm, 0, sizeof (struct tm));
        got_wday = got_month = got_tzone = FALSE;
        offset = 0;

        token = tokens;
        while (token) {
                if (is_weekday (token) && !got_wday) {
                        if ((n = get_wday (token->start, token->len)) != -1) {
                                d(printf ("weekday; "));
                                got_wday = TRUE;
                                tm.tm_wday = n;
                                goto next;
                        }
                }

                if (is_month (token) && !got_month) {
                        if ((n = get_month (token->start, token->len)) != -1) {
                                d(printf ("month; "));
                                got_month = TRUE;
                                tm.tm_mon = n;
                                goto next;
                        }
                }

                /* a time is only accepted while hour, minute and second are all still zero */
                if (is_time (token) && !tm.tm_hour && !tm.tm_min && !tm.tm_sec) {
                        if (get_time (token->start, token->len, &hour, &min, &sec)) {
                                d(printf ("time; "));
                                tm.tm_hour = hour;
                                tm.tm_min = min;
                                tm.tm_sec = sec;
                                goto next;
                        }
                }

                if (is_tzone (token) && !got_tzone) {
                        /* get_tzone() may advance the cursor it is given, so
                         * hand it a copy (this local shadows the outer time_t t) */
                        date_token *t = token;

                        if ((n = get_tzone (&t)) != -1) {
                                d(printf ("tzone; "));
                                got_tzone = TRUE;
                                offset = n;
                                goto next;
                        }
                }

                if (is_numeric (token)) {
                        if (token->len == 4 && !tm.tm_year) {
                                /* four digits and no year yet: try it as a full year */
                                if ((n = get_year (token->start, token->len)) != -1) {
                                        d(printf ("year; "));
                                        tm.tm_year = n - 1900;
                                        goto next;
                                }
                        } else {
                                /* Note: assumes MM-DD-YY ordering if '0 < MM < 12' holds true */
                                if (!got_month && token->next && is_numeric (token->next)) {
                                        if ((n = decode_int (token->start, token->len)) > 12) {
                                                /* too large for a month: store it as the day of
                                                 * the month (this bypasses get_mday()'s range check) */
                                                goto mday;
                                        } else if (n > 0) {
                                                d(printf ("mon; "));
                                                got_month = TRUE;
                                                tm.tm_mon = n - 1;
                                        }
                                        /* n <= 0: the token is consumed without being stored */
                                        goto next;
                                } else if (!tm.tm_mday && (n = get_mday (token->start, token->len)) != -1) {
                                mday:
                                        d(printf ("mday; "));
                                        tm.tm_mday = n;
                                        goto next;
                                } else if (!tm.tm_year) {
                                        if ((n = get_year (token->start, token->len)) != -1) {
                                                d(printf ("2-digit year; "));
                                                tm.tm_year = n - 1900;
                                        }
                                        goto next;
                                }
                        }
                }

                d(printf ("???; "));

        next:

                token = token->next;
        }

        d(printf ("\n"));

        t = mktime_utc (&tm);

        /* t is now GMT of the time we want, but not offset by the timezone ... */

        /* this should convert the time to the GMT equiv time */
        t -= ((offset / 100) * 60 * 60) + (offset % 100) * 60;

        if (tzone)
                *tzone = offset;

        return t;
}
 678
#if 0
/*
 * Disabled (compiled out by the surrounding #if 0).
 *
 * Generator for gmime_datetok_table[]: rebuilds the table in memory
 * from the *_CHARS macros and prints it to stdout as a C declaration.
 * For every byte value a DATE_TOKEN_NON_* bit is set when the byte is
 * not found in the matching character set, and the HAS_COLON /
 * HAS_SIGN bits are set for ':' and for the characters of "+-".
 *
 * Since strchr() also matches the terminating NUL of the string it
 * searches, byte value 0 gets no NON_* bit but does get HAS_SIGN.
 */
static void
gmime_datetok_table_init (void)
{
        int i;

        memset (gmime_datetok_table, 0, sizeof (gmime_datetok_table));

        for (i = 0; i < 256; i++) {
                if (!strchr (NUMERIC_CHARS, i))
                        gmime_datetok_table[i] |= DATE_TOKEN_NON_NUMERIC;

                if (!strchr (WEEKDAY_CHARS, i))
                        gmime_datetok_table[i] |= DATE_TOKEN_NON_WEEKDAY;

                if (!strchr (MONTH_CHARS, i))
                        gmime_datetok_table[i] |= DATE_TOKEN_NON_MONTH;

                if (!strchr (TIME_CHARS, i))
                        gmime_datetok_table[i] |= DATE_TOKEN_NON_TIME;

                if (!strchr (TIMEZONE_ALPHA_CHARS, i))
                        gmime_datetok_table[i] |= DATE_TOKEN_NON_TIMEZONE_ALPHA;

                if (!strchr (TIMEZONE_NUMERIC_CHARS, i))
                        gmime_datetok_table[i] |= DATE_TOKEN_NON_TIMEZONE_NUMERIC;

                if (((char) i) == ':')
                        gmime_datetok_table[i] |= DATE_TOKEN_HAS_COLON;

                if (strchr ("+-", i))
                        gmime_datetok_table[i] |= DATE_TOKEN_HAS_SIGN;
        }

        /* emit the table, sixteen entries per line */
        printf ("static unsigned char gmime_datetok_table[256] = {");
        for (i = 0; i < 256; i++) {
                if (i % 16 == 0)
                        printf ("\n\t");
                printf ("%3d,", gmime_datetok_table[i]);
        }
        printf ("\n};\n");
}
#endif
 722
 723
 724 /**
 725  * g_mime_utils_header_decode_date:
 726  * @str: input date string
 727  * @tz_offset: timezone offset
 728  *
 729  * Decodes the rfc822 date string and saves the GMT offset into
 730  * @tz_offset if non-NULL.
 731  *
 732  * Returns: the time_t representation of the date string specified by
 733  * @str or (time_t) %0 on error. If @tz_offset is non-NULL, the value
 734  * of the timezone offset will be stored.
 735  **/
 736 time_t
 737 g_mime_utils_header_decode_date (const char *str, int *tz_offset)
 738 {
 739         date_token *token, *tokens;
 740         time_t date;
 741
 742         if (!(tokens = datetok (str))) {
 743                 if (tz_offset)
 744                         *tz_offset = 0;
 745
 746                 return (time_t) 0;
 747         }
 748
 749         if (!(date = parse_rfc822_date (tokens, tz_offset)))
 750                 date = parse_broken_date (tokens, tz_offset);
 751
 752         /* cleanup */
 753         while (tokens) {
 754                 token = tokens;
 755                 tokens = tokens->next;
 756                 date_token_free (token);
 757         }
 758
 759         return date;
 760 }
 761
 762
 763 /**
 764  * g_mime_utils_generate_message_id:
 765  * @fqdn: Fully qualified domain name
 766  *
 767  * Generates a unique Message-Id.
 768  *
 769  * Returns: a unique string in an addr-spec format suitable for use as
 770  * a Message-Id.
 771  **/
 772 char *
 773 g_mime_utils_generate_message_id (const char *fqdn)
 774 {
 775 #ifdef G_THREADS_ENABLED
 776         static GStaticMutex mutex = G_STATIC_MUTEX_INIT;
 777 #define MUTEX_LOCK()   g_static_mutex_lock (&mutex)
 778 #define MUTEX_UNLOCK() g_static_mutex_unlock (&mutex)
 779 #else
 780 #define MUTEX_LOCK()
 781 #define MUTEX_UNLOCK()
 782 #endif
 783         static unsigned long int count = 0;
 784         const char *hostname = NULL;
 785         char *name = NULL;
 786         char *msgid;
 787
 788         if (!fqdn) {
 789 #ifdef HAVE_UTSNAME_DOMAINNAME
 790                 struct utsname unam;
 791
 792                 uname (&unam);
 793
 794                 hostname = unam.nodename;
 795
 796                 if (unam.domainname[0])
 797                         name = g_strdup_printf ("%s.%s", hostname, unam.domainname);
 798 #else /* ! HAVE_UTSNAME_DOMAINNAME */
 799                 char host[MAXHOSTNAMELEN + 1];
 800
 801 #ifdef HAVE_GETHOSTNAME
 802                 host[MAXHOSTNAMELEN] = '\0';
 803                 if (gethostname (host, MAXHOSTNAMELEN) == 0) {
 804 #ifdef HAVE_GETDOMAINNAME
 805                         size_t domainlen = MAXHOSTNAMELEN;
 806                         char *domain;
 807                         int rv;
 808
 809                         domain = g_malloc (domainlen);
 810
 811                         while ((rv = getdomainname (domain, domainlen)) == -1 && errno == EINVAL) {
 812                                 domainlen += MAXHOSTNAMELEN;
 813                                 domain = g_realloc (domain, domainlen);
 814                         }
 815
 816                         if (rv == 0 && domain[0]) {
 817                                 if (host[0]) {
 818                                         name = g_strdup_printf ("%s.%s", host, domain);
 819                                         g_free (domain);
 820                                 } else {
 821                                         name = domain;
 822                                 }
 823                         }
 824 #endif /* HAVE_GETDOMAINNAME */
 825                 } else {
 826                         host[0] = '\0';
 827                 }
 828 #endif /* HAVE_GETHOSTNAME */
 829                 hostname = host;
 830 #endif /* HAVE_UTSNAME_DOMAINNAME */
 831
 832 #ifdef HAVE_GETADDRINFO
 833                 if (!name && hostname[0]) {
 834                         /* we weren't able to get a domain name */
 835                         struct addrinfo hints, *res;
 836
 837                         memset (&hints, 0, sizeof (hints));
 838                         hints.ai_flags = AI_CANONNAME;
 839
 840                         if (getaddrinfo (hostname, NULL, &hints, &res) == 0) {
 841                                 name = g_strdup (res->ai_canonname);
 842                                 freeaddrinfo (res);
 843                         }
 844                 }
 845 #endif /* HAVE_GETADDRINFO */
 846
 847                 fqdn = name != NULL ? name : (hostname[0] ? hostname : "localhost.localdomain");
 848         }
 849
 850         MUTEX_LOCK ();
 851         msgid = g_strdup_printf ("%lu.%lu.%lu@%s", (unsigned long int) time (NULL),
 852                                  (unsigned long int) getpid (), count++, fqdn);
 853         MUTEX_UNLOCK ();
 854
 855         g_free (name);
 856
 857         return msgid;
 858 }
 859
 860 static char *
 861 decode_addrspec (const char **in)
 862 {
 863         const char *word, *inptr;
 864         GString *addrspec;
 865         char *str;
 866
 867         decode_lwsp (in);
 868         inptr = *in;
 869
 870         if (!(word = decode_word (&inptr))) {
 871                 w(g_warning ("No local-part in addr-spec: %s", *in));
 872                 return NULL;
 873         }
 874
 875         addrspec = g_string_new ("");
 876         g_string_append_len (addrspec, word, (size_t) (inptr - word));
 877
 878         /* get the rest of the local-part */
 879         decode_lwsp (&inptr);
 880         while (*inptr == '.') {
 881                 g_string_append_c (addrspec, *inptr++);
 882                 if ((word = decode_word (&inptr))) {
 883                         g_string_append_len (addrspec, word, (size_t) (inptr - word));
 884                         decode_lwsp (&inptr);
 885                 } else {
 886                         w(g_warning ("Invalid local-part in addr-spec: %s", *in));
 887                         goto exception;
 888                 }
 889         }
 890
 891         /* we should be at the '@' now... */
 892         if (*inptr++ != '@') {
 893                 w(g_warning ("Invalid addr-spec; missing '@': %s", *in));
 894                 goto exception;
 895         }
 896
 897         g_string_append_c (addrspec, '@');
 898         if (!decode_domain (&inptr, addrspec)) {
 899                 w(g_warning ("No domain in addr-spec: %s", *in));
 900                 goto exception;
 901         }
 902
 903         str = addrspec->str;
 904         g_string_free (addrspec, FALSE);
 905
 906         *in = inptr;
 907
 908         return str;
 909
 910  exception:
 911
 912         g_string_free (addrspec, TRUE);
 913
 914         return NULL;
 915 }
 916
 917 static char *
 918 decode_msgid (const char **in)
 919 {
 920         const char *inptr = *in;
 921         char *msgid = NULL;
 922
 923         decode_lwsp (&inptr);
 924         if (*inptr != '<') {
 925                 w(g_warning ("Invalid msg-id; missing '<': %s", *in));
 926         } else {
 927                 inptr++;
 928         }
 929
 930         decode_lwsp (&inptr);
 931         if ((msgid = decode_addrspec (&inptr))) {
 932                 decode_lwsp (&inptr);
 933                 if (*inptr != '>') {
 934                         w(g_warning ("Invalid msg-id; missing '>': %s", *in));
 935                 } else {
 936                         inptr++;
 937                 }
 938
 939                 *in = inptr;
 940         } else {
 941                 w(g_warning ("Invalid msg-id; missing addr-spec: %s", *in));
 942                 *in = inptr;
 943                 while (*inptr && *inptr != '>')
 944                         inptr++;
 945
 946                 msgid = g_strndup (*in, (size_t) (inptr - *in));
 947                 *in = inptr;
 948         }
 949
 950         return msgid;
 951 }
 952
 953
 954 /**
 955  * g_mime_utils_decode_message_id:
 956  * @message_id: string containing a message-id
 957  *
 958  * Decodes a msg-id as defined by rfc822.
 959  *
 960  * Returns: the addr-spec portion of the msg-id.
 961  **/
 962 char *
 963 g_mime_utils_decode_message_id (const char *message_id)
 964 {
 965         g_return_val_if_fail (message_id != NULL, NULL);
 966
 967         return decode_msgid (&message_id);
 968 }
 969
 970
 971 /**
 972  * g_mime_references_decode:
 973  * @text: string containing a list of msg-ids
 974  *
 975  * Decodes a list of msg-ids as in the References and/or In-Reply-To
 976  * headers defined in rfc822.
 977  *
 978  * Returns: a list of referenced msg-ids.
 979  **/
 980 GMimeReferences *
 981 g_mime_references_decode (const char *text)
 982 {
 983         GMimeReferences *refs, *tail, *ref;
 984         const char *word, *inptr = text;
 985         char *msgid;
 986
 987         g_return_val_if_fail (text != NULL, NULL);
 988
 989         refs = NULL;
 990         tail = (GMimeReferences *) &refs;
 991
 992         while (*inptr) {
 993                 decode_lwsp (&inptr);
 994                 if (*inptr == '<') {
 995                         /* looks like a msg-id */
 996                         if ((msgid = decode_msgid (&inptr))) {
 997                                 ref = g_new (GMimeReferences, 1);
 998                                 ref->next = NULL;
 999                                 ref->msgid = msgid;
1000                                 tail->next = ref;
1001                                 tail = ref;
1002                         } else {
1003                                 w(g_warning ("Invalid References header: %s", inptr));
1004                                 break;
1005                         }
1006                 } else if (*inptr) {
1007                         /* looks like part of a phrase */
1008                         if (!(word = decode_word (&inptr))) {
1009                                 w(g_warning ("Invalid References header: %s", inptr));
1010                                 break;
1011                         }
1012                 }
1013         }
1014
1015         return refs;
1016 }
1017
1018
1019 /**
1020  * g_mime_references_append:
1021  * @refs: the address of a #GMimeReferences list
1022  * @msgid: a message-id string
1023  *
1024  * Appends a reference to msgid to the list of references.
1025  **/
1026 void
1027 g_mime_references_append (GMimeReferences **refs, const char *msgid)
1028 {
1029         GMimeReferences *ref;
1030
1031         g_return_if_fail (refs != NULL);
1032         g_return_if_fail (msgid != NULL);
1033
1034         ref = (GMimeReferences *) refs;
1035         while (ref->next)
1036                 ref = ref->next;
1037
1038         ref->next = g_new (GMimeReferences, 1);
1039         ref->next->msgid = g_strdup (msgid);
1040         ref->next->next = NULL;
1041 }
1042
1043
1044 /**
1045  * g_mime_references_free:
1046  * @refs: a #GMimeReferences list
1047  *
1048  * Frees the #GMimeReferences list.
1049  **/
1050 void
1051 g_mime_references_free (GMimeReferences *refs)
1052 {
1053         GMimeReferences *ref, *next;
1054
1055         ref = refs;
1056         while (ref) {
1057                 next = ref->next;
1058                 g_free (ref->msgid);
1059                 g_free (ref);
1060                 ref = next;
1061         }
1062 }
1063
1064
1065 /**
1066  * g_mime_references_clear:
1067  * @refs: address of a #GMimeReferences list
1068  *
1069  * Clears the #GMimeReferences list and resets it to %NULL.
1070  **/
1071 void
1072 g_mime_references_clear (GMimeReferences **refs)
1073 {
1074         g_return_if_fail (refs != NULL);
1075
1076         g_mime_references_free (*refs);
1077         *refs = NULL;
1078 }
1079
1080
1081 /**
1082  * g_mime_references_get_next:
1083  * @ref: a #GMimeReferences list
1084  *
1085  * Advances to the next reference node in the #GMimeReferences list.
1086  *
1087  * Returns: the next reference node in the #GMimeReferences list.
1088  **/
1089 const GMimeReferences *
1090 g_mime_references_get_next (const GMimeReferences *ref)
1091 {
1092         return ref ? ref->next : NULL;
1093 }
1094
1095
1096 /**
1097  * g_mime_references_get_message_id:
1098  * @ref: a #GMimeReferences list
1099  *
1100  * Gets the Message-Id reference from the #GMimeReferences node.
1101  *
1102  * Returns: the Message-Id reference from the #GMimeReferences node.
1103  **/
1104 const char *
1105 g_mime_references_get_message_id (const GMimeReferences *ref)
1106 {
1107         return ref ? ref->msgid : NULL;
1108 }
1109
1110
1111 static gboolean
1112 is_rfc2047_token (const char *inptr, size_t len)
1113 {
1114         if (len < 8 || strncmp (inptr, "=?", 2) != 0 || strncmp (inptr + len - 2, "?=", 2) != 0)
1115                 return FALSE;
1116
1117         inptr += 2;
1118         len -= 2;
1119
1120         /* skip past the charset */
1121         while (*inptr != '?' && len > 0) {
1122                 inptr++;
1123                 len--;
1124         }
1125
1126         if (*inptr != '?' || len < 4)
1127                 return FALSE;
1128
1129         if (inptr[1] != 'q' && inptr[1] != 'Q' && inptr[1] != 'b' && inptr[1] != 'B')
1130                 return FALSE;
1131
1132         inptr += 2;
1133         len -= 2;
1134
1135         if (*inptr != '?')
1136                 return FALSE;
1137
1138         return TRUE;
1139 }
1140
1141 static char *
1142 header_fold (const char *in, gboolean structured)
1143 {
1144         gboolean last_was_lwsp = FALSE;
1145         register const char *inptr;
1146         size_t len, outlen, i;
1147         size_t fieldlen;
1148         GString *out;
1149         char *ret;
1150
1151         inptr = in;
1152         len = strlen (in);
1153         if (len <= GMIME_FOLD_LEN + 1)
1154                 return g_strdup (in);
1155
1156         out = g_string_new ("");
1157         fieldlen = strcspn (inptr, ": \t\n");
1158         g_string_append_len (out, inptr, fieldlen);
1159         outlen = fieldlen;
1160         inptr += fieldlen;
1161
1162         while (*inptr && *inptr != '\n') {
1163                 len = strcspn (inptr, " \t\n");
1164
1165                 if (len > 1 && outlen + len > GMIME_FOLD_LEN) {
1166                         if (outlen > 1 && out->len > fieldlen + 2) {
1167                                 if (last_was_lwsp) {
1168                                         if (structured)
1169                                                 out->str[out->len - 1] = '\t';
1170
1171                                         g_string_insert_c (out, out->len - 1, '\n');
1172                                 } else
1173                                         g_string_append (out, "\n\t");
1174                                 outlen = 1;
1175                         }
1176
1177                         if (!structured && !is_rfc2047_token (inptr, len)) {
1178                                 /* check for very long words, just cut them up */
1179                                 while (outlen + len > GMIME_FOLD_LEN) {
1180                                         for (i = 0; i < GMIME_FOLD_LEN - outlen; i++)
1181                                                 g_string_append_c (out, inptr[i]);
1182                                         inptr += GMIME_FOLD_LEN - outlen;
1183                                         len -= GMIME_FOLD_LEN - outlen;
1184                                         g_string_append (out, "\n\t");
1185                                         outlen = 1;
1186                                 }
1187                         } else {
1188                                 g_string_append_len (out, inptr, len);
1189                                 outlen += len;
1190                                 inptr += len;
1191                         }
1192                         last_was_lwsp = FALSE;
1193                 } else if (len > 0) {
1194                         g_string_append_len (out, inptr, len);
1195                         outlen += len;
1196                         inptr += len;
1197                         last_was_lwsp = FALSE;
1198                 } else {
1199                         last_was_lwsp = TRUE;
1200                         if (*inptr == '\t') {
1201                                 /* tabs are a good place to fold, odds
1202                                    are that this is where the previous
1203                                    mailer folded it */
1204                                 g_string_append (out, "\n\t");
1205                                 outlen = 1;
1206                                 while (is_blank (*inptr))
1207                                         inptr++;
1208                         } else {
1209                                 g_string_append_c (out, *inptr++);
1210                                 outlen++;
1211                         }
1212                 }
1213         }
1214
1215         if (*inptr == '\n' && out->str[out->len - 1] != '\n')
1216                 g_string_append_c (out, '\n');
1217
1218         ret = out->str;
1219         g_string_free (out, FALSE);
1220
1221         return ret;
1222 }
1223
1224
1225 /**
1226  * g_mime_utils_structured_header_fold:
1227  * @str: input string
1228  *
1229  * Folds a structured header according to the rules in rfc822.
1230  *
1231  * Returns: an allocated string containing the folded header.
1232  **/
1233 char *
1234 g_mime_utils_structured_header_fold (const char *str)
1235 {
1236         return header_fold (str, TRUE);
1237 }
1238
1239
1240 /**
1241  * g_mime_utils_unstructured_header_fold:
1242  * @str: input string
1243  *
1244  * Folds an unstructured header according to the rules in rfc822.
1245  *
1246  * Returns: an allocated string containing the folded header.
1247  **/
1248 char *
1249 g_mime_utils_unstructured_header_fold (const char *str)
1250 {
1251         return header_fold (str, FALSE);
1252 }
1253
1254
1255 /**
1256  * g_mime_utils_header_fold:
1257  * @str: input string
1258  *
1259  * Folds a structured header according to the rules in rfc822.
1260  *
1261  * Returns: an allocated string containing the folded header.
1262  **/
1263 char *
1264 g_mime_utils_header_fold (const char *str)
1265 {
1266         return header_fold (str, TRUE);
1267 }
1268
1269
1270 /**
1271  * g_mime_utils_header_printf:
1272  * @format: string format
1273  * @Varargs: arguments
1274  *
1275  * Allocates a buffer containing a formatted header specified by the
1276  * @Varargs.
1277  *
1278  * Returns: an allocated string containing the folded header specified
1279  * by @format and the following arguments.
1280  **/
1281 char *
1282 g_mime_utils_header_printf (const char *format, ...)
1283 {
1284         char *buf, *ret;
1285         va_list ap;
1286
1287         va_start (ap, format);
1288         buf = g_strdup_vprintf (format, ap);
1289         va_end (ap);
1290
1291         ret = header_fold (buf, TRUE);
1292         g_free (buf);
1293
1294         return ret;
1295 }
1296
1297 static gboolean
1298 need_quotes (const char *string)
1299 {
1300         gboolean quoted = FALSE;
1301         const char *inptr;
1302
1303         inptr = string;
1304
1305         while (*inptr) {
1306                 if (*inptr == '\\')
1307                         inptr++;
1308                 else if (*inptr == '"')
1309                         quoted = !quoted;
1310                 else if (!quoted && (is_tspecial (*inptr) || *inptr == '.'))
1311                         return TRUE;
1312
1313                 if (*inptr)
1314                         inptr++;
1315         }
1316
1317         return FALSE;
1318 }
1319
1320 /**
1321  * g_mime_utils_quote_string:
1322  * @str: input string
1323  *
1324  * Quotes @string as needed according to the rules in rfc2045.
1325  *
1326  * Returns: an allocated string containing the escaped and quoted (if
1327  * needed to be) input string. The decision to quote the string is
1328  * based on whether or not the input string contains any 'tspecials'
1329  * as defined by rfc2045.
1330  **/
1331 char *
1332 g_mime_utils_quote_string (const char *str)
1333 {
1334         gboolean quote;
1335         const char *c;
1336         char *qstring;
1337         GString *out;
1338
1339         out = g_string_new ("");
1340
1341         if ((quote = need_quotes (str)))
1342                 g_string_append_c (out, '"');
1343
1344         for (c = str; *c; c++) {
1345                 if ((*c == '"' && quote) || *c == '\\')
1346                         g_string_append_c (out, '\\');
1347
1348                 g_string_append_c (out, *c);
1349         }
1350
1351         if (quote)
1352                 g_string_append_c (out, '"');
1353
1354         qstring = out->str;
1355         g_string_free (out, FALSE);
1356
1357         return qstring;
1358 }
1359
1360
1361 /**
1362  * g_mime_utils_unquote_string:
1363  * @str: input string
1364  *
1365  * Unquotes and unescapes a string.
1366  **/
1367 void
1368 g_mime_utils_unquote_string (char *str)
1369 {
1370         /* if the string is quoted, unquote it */
1371         register char *inptr = str;
1372         int escaped = FALSE;
1373         int quoted = FALSE;
1374
1375         if (!str)
1376                 return;
1377
1378         while (*inptr) {
1379                 if (*inptr == '\\') {
1380                         if (escaped)
1381                                 *str++ = *inptr++;
1382                         else
1383                                 inptr++;
1384                         escaped = !escaped;
1385                 } else if (*inptr == '"') {
1386                         if (escaped) {
1387                                 *str++ = *inptr++;
1388                                 escaped = FALSE;
1389                         } else {
1390                                 quoted = !quoted;
1391                                 inptr++;
1392                         }
1393                 } else {
1394                         *str++ = *inptr++;
1395                         escaped = FALSE;
1396                 }
1397         }
1398
1399         *str = '\0';
1400 }
1401
1402
1403 /**
1404  * g_mime_utils_text_is_8bit:
1405  * @text: text to check for 8bit chars
1406  * @len: text length
1407  *
1408  * Determines if @text contains 8bit characters within the first @len
1409  * bytes.
1410  *
1411  * Returns: %TRUE if the text contains 8bit characters or %FALSE
1412  * otherwise.
1413  **/
1414 gboolean
1415 g_mime_utils_text_is_8bit (const unsigned char *text, size_t len)
1416 {
1417         register const unsigned char *inptr;
1418         const unsigned char *inend;
1419
1420         g_return_val_if_fail (text != NULL, FALSE);
1421
1422         inend = text + len;
1423         for (inptr = text; *inptr && inptr < inend; inptr++)
1424                 if (*inptr > (unsigned char) 127)
1425                         return TRUE;
1426
1427         return FALSE;
1428 }
1429
1430
1431 /**
1432  * g_mime_utils_best_encoding:
1433  * @text: text to encode
1434  * @len: text length
1435  *
1436  * Determines the best content encoding for the first @len bytes of
1437  * @text.
1438  *
1439  * Returns: a #GMimeContentEncoding that is determined to be the best
1440  * encoding type for the specified block of text. ("best" in this
1441  * particular case means smallest output size)
1442  **/
1443 GMimeContentEncoding
1444 g_mime_utils_best_encoding (const unsigned char *text, size_t len)
1445 {
1446         const unsigned char *ch, *inend;
1447         size_t count = 0;
1448
1449         inend = text + len;
1450         for (ch = text; ch < inend; ch++)
1451                 if (*ch > (unsigned char) 127)
1452                         count++;
1453
1454         if ((float) count <= len * 0.17)
1455                 return GMIME_CONTENT_ENCODING_QUOTEDPRINTABLE;
1456         else
1457                 return GMIME_CONTENT_ENCODING_BASE64;
1458 }
1459
1460
/**
 * charset_convert:
 * @cd: iconv converter
 * @inbuf: input text buffer to convert
 * @inleft: length of the input buffer
 * @outp: pointer to output buffer
 * @outlenp: pointer to output buffer length
 * @ninval: the number of invalid bytes in @inbuf
 *
 * Converts the input buffer from one charset to another using the
 * @cd. On completion, @outp will point to the output buffer
 * containing the converted text (nul-terminated), @outlenp will be
 * the size of the @outp buffer (note: not the strlen() of @outp) and
 * @ninval will contain the number of bytes which could not be
 * converted.
 *
 * Bytes which cannot be converted from @inbuf will appear as '?'
 * characters in the output buffer.
 *
 * If *@outp is non-NULL, then it is assumed that it points to a
 * pre-allocated buffer with room for *@outlenp bytes plus the
 * terminating nul. This is done so that the same output buffer can be
 * reused multiple times. The buffer may be reallocated, so callers
 * must use the value stored back into *@outp.
 *
 * Returns: the number of bytes written to the output buffer,
 * including the terminating nul.
 **/
static size_t
charset_convert (iconv_t cd, const char *inbuf, size_t inleft, char **outp, size_t *outlenp, size_t *ninval)
{
	size_t outlen, outleft, rc, used, n = 0;
	char *outbuf, *out;
	int err;

	if (*outp == NULL) {
		outleft = outlen = (inleft * 2) + 16;
		outbuf = out = g_malloc (outlen + 1);
	} else {
		outleft = outlen = *outlenp;
		outbuf = out = *outp;
	}

	do {
		rc = iconv (cd, (char **) &inbuf, &inleft, &outbuf, &outleft);
		if (rc == (size_t) -1) {
			if (errno == EINVAL) {
				/* incomplete sequence at the end of the input buffer */
				n += inleft;
				break;
			}

#ifdef G_OS_WIN32
			/* seems that GnuWin32's libiconv 1.9 does not set errno in
			 * the E2BIG case, so we have to fake it */
			if (outleft <= inleft)
				errno = E2BIG;
#endif

			/* remember the reason for the failure now; the
			 * allocator call below is free to change errno */
			err = errno;

			if (err == E2BIG || outleft == 0) {
				/* need to grow the output buffer (also done when
				 * an invalid sequence was hit with no room left
				 * for the '?' replacement) */
				outlen += (inleft * 2) + 16;
				used = (size_t) (outbuf - out);
				out = g_realloc (out, outlen + 1);
				outleft = outlen - used;
				outbuf = out + used;
			}

			if (err != E2BIG) {
				/* invalid byte(-sequence) in the input buffer */
				*outbuf++ = '?';
				outleft--;
				inleft--;
				inbuf++;
				n++;
			}
		}
	} while (inleft > 0);

	/* flush the converter; the byte reserved beyond 'outlen' holds
	 * the terminator */
	iconv (cd, NULL, NULL, &outbuf, &outleft);
	*outbuf++ = '\0';

	*outlenp = outlen;
	*outp = out;
	*ninval = n;

	return (outbuf - out);
}
1543
1544
#define USER_CHARSETS_INCLUDE_UTF8    (1 << 0)
#define USER_CHARSETS_INCLUDE_LOCALE  (1 << 1)


/**
 * g_mime_utils_decode_8bit:
 * @text: input text in unknown 8bit/multibyte character set
 * @len: input text length
 *
 * Attempts to convert text in an unknown 8bit/multibyte charset into
 * UTF-8 by finding the charset which will convert the most bytes into
 * valid UTF-8 characters as possible. If no exact match can be found,
 * it will choose the best match and convert invalid byte sequences
 * into question-marks (?) in the returned string buffer.
 *
 * The candidates tried are the user charsets, then UTF-8 and the
 * locale charset unless they were already among the user charsets.
 *
 * Returns: a UTF-8 string representation of @text.
 **/
char *
g_mime_utils_decode_8bit (const char *text, size_t len)
{
	const char **charsets, **user_charsets, *locale, *best;
	size_t outleft, outlen, min, ninval;
	unsigned int included = 0;
	iconv_t cd;
	char *out;
	int i = 0;

	g_return_val_if_fail (text != NULL, NULL);

	/* a UTF-8 locale is already covered by the UTF-8 candidate */
	locale = g_mime_locale_charset ();
	if (locale && !g_ascii_strcasecmp (locale, "UTF-8"))
		included |= USER_CHARSETS_INCLUDE_LOCALE;

	if ((user_charsets = g_mime_user_charsets ())) {
		while (user_charsets[i])
			i++;
	}

	/* room for the user charsets + UTF-8 + locale + NULL terminator */
	charsets = g_alloca (sizeof (char *) * (i + 3));
	i = 0;

	if (user_charsets) {
		while (user_charsets[i]) {
			/* keep a record of whether or not the user-supplied
			 * charsets include UTF-8 and/or the default fallback
			 * charset so that we avoid doubling our efforts for
			 * these 2 charsets. We could have used a hash table
			 * to keep track of unique charsets, but we can
			 * (hopefully) assume that user_charsets is a unique
			 * list of charsets with no duplicates. */
			if (!g_ascii_strcasecmp (user_charsets[i], "UTF-8"))
				included |= USER_CHARSETS_INCLUDE_UTF8;

			if (locale && !g_ascii_strcasecmp (user_charsets[i], locale))
				included |= USER_CHARSETS_INCLUDE_LOCALE;

			charsets[i] = user_charsets[i];
			i++;
		}
	}

	if (!(included & USER_CHARSETS_INCLUDE_UTF8))
		charsets[i++] = "UTF-8";

	if (!(included & USER_CHARSETS_INCLUDE_LOCALE))
		charsets[i++] = locale;

	charsets[i] = NULL;

	min = len;
	best = charsets[0];

	outleft = (len * 2) + 16;
	out = g_malloc (outleft + 1);

	for (i = 0; charsets[i]; i++) {
		if ((cd = g_mime_iconv_open ("UTF-8", charsets[i])) == (iconv_t) -1)
			continue;

		outlen = charset_convert (cd, text, len, &out, &outleft, &ninval);

		g_mime_iconv_close (cd);

		/* a flawless conversion wins immediately */
		if (ninval == 0)
			return g_realloc (out, outlen + 1);

		if (ninval < min) {
			best = charsets[i];
			min = ninval;
		}
	}

	/* if we get here, then none of the charsets fit the 8bit text flawlessly...
	 * try to find the one that fit the best and use that to convert what we can,
	 * replacing any byte we can't convert with a '?' */

	if ((cd = g_mime_iconv_open ("UTF-8", best)) == (iconv_t) -1) {
		/* this shouldn't happen... but if we are here, then
		 * it did...  the only thing we can do at this point
		 * is replace the 8bit garbage and pray */
		register const char *inptr = text;
		const char *inend = inptr + len;
		char *outbuf = out;

		while (inptr < inend) {
			/* every input byte is consumed, whether it is
			 * copied or replaced, so the loop always ends */
			if (is_ascii (*inptr))
				*outbuf++ = *inptr;
			else
				*outbuf++ = '?';

			inptr++;
		}

		*outbuf++ = '\0';

		return g_realloc (out, (size_t) (outbuf - out));
	}

	outlen = charset_convert (cd, text, len, &out, &outleft, &ninval);

	g_mime_iconv_close (cd);

	return g_realloc (out, outlen + 1);
}
1667
1668
/* Decodes rfc2047's flavour of quoted-printable ("Q" encoding).
 *
 * Reads @len bytes from @in and writes the decoded bytes to @out, which
 * must have room for at least @len bytes. "=XY" becomes the byte with
 * hex value XY (the two characters are not checked for being hex
 * digits), '_' becomes a space and everything else is copied as-is.
 *
 * Returns the number of bytes written, or -1 if the input ends in the
 * middle of an "=XY" escape. */
static ssize_t
quoted_decode (const unsigned char *in, size_t len, unsigned char *out)
{
	const unsigned char *src = in;
	const unsigned char *end = in + len;
	unsigned char *dst = out;
	unsigned char ch, hi, lo;

	while (src < end) {
		ch = *src++;

		switch (ch) {
		case '=':
			/* data was truncated */
			if (end - src < 2)
				return -1;

			hi = toupper (src[0]);
			lo = toupper (src[1]);
			src += 2;

			/* map each character onto a 4-bit value */
			hi = (hi >= 'A' ? hi - 'A' + 10 : hi - '0') & 0x0f;
			lo = (lo >= 'A' ? lo - 'A' + 10 : lo - '0') & 0x0f;

			*dst++ = (hi << 4) | lo;
			break;
		case '_':
			/* _'s are an rfc2047 shortcut for encoding spaces */
			*dst++ = ' ';
			break;
		default:
			*dst++ = ch;
			break;
		}
	}

	return (ssize_t) (dst - out);
}
1704
1705 #define is_rfc2047_encoded_word(atom, len) (len >= 7 && !strncmp (atom, "=?", 2) && !strncmp (atom + len - 2, "?=", 2))
1706
1707 static char *
1708 rfc2047_decode_word (const char *in, size_t inlen)
1709 {
1710         const unsigned char *instart = (const unsigned char *) in;
1711         const register unsigned char *inptr = instart + 2;
1712         const unsigned char *inend = instart + inlen - 2;
1713         unsigned char *decoded;
1714         const char *charset;
1715         size_t len, ninval;
1716         char *charenc, *p;
1717         guint32 save = 0;
1718         ssize_t declen;
1719         int state = 0;
1720         iconv_t cd;
1721         char *buf;
1722
1723         /* skip over the charset */
1724         if (!(inptr = memchr (inptr, '?', inend - inptr)) || inptr[2] != '?')
1725                 return NULL;
1726
1727         inptr++;
1728
1729         switch (*inptr) {
1730         case 'B':
1731         case 'b':
1732                 inptr += 2;
1733                 len = (size_t) (inend - inptr);
1734                 decoded = g_alloca (len);
1735                 declen = g_mime_encoding_base64_decode_step (inptr, len, decoded, &state, &save);
1736
1737                 if (declen == -1) {
1738                         d(fprintf (stderr, "encountered broken 'Q' encoding\n"));
1739                         return NULL;
1740                 }
1741                 break;
1742         case 'Q':
1743         case 'q':
1744                 inptr += 2;
1745                 len = (size_t) (inend - inptr);
1746                 decoded = g_alloca (len);
1747                 declen = quoted_decode (inptr, len, decoded);
1748
1749                 if (declen == -1) {
1750                         d(fprintf (stderr, "encountered broken 'Q' encoding\n"));
1751                         return NULL;
1752                 }
1753                 break;
1754         default:
1755                 d(fprintf (stderr, "unknown encoding\n"));
1756                 return NULL;
1757         }
1758
1759         len = (inptr - 3) - (instart + 2);
1760         charenc = g_alloca (len + 1);
1761         memcpy (charenc, in + 2, len);
1762         charenc[len] = '\0';
1763         charset = charenc;
1764
1765         /* rfc2231 updates rfc2047 encoded words...
1766          * The ABNF given in RFC 2047 for encoded-words is:
1767          *   encoded-word := "=?" charset "?" encoding "?" encoded-text "?="
1768          * This specification changes this ABNF to:
1769          *   encoded-word := "=?" charset ["*" language] "?" encoding "?" encoded-text "?="
1770          */
1771
1772         /* trim off the 'language' part if it's there... */
1773         if ((p = strchr (charset, '*')))
1774                 *p = '\0';
1775
1776         /* slight optimization? */
1777         if (!g_ascii_strcasecmp (charset, "UTF-8")) {
1778                 p = (char *) decoded;
1779                 len = declen;
1780
1781                 //while (!g_utf8_validate (p, len, (const char **) &p)) {
1782                 //      len = declen - (p - (char *) decoded);
1783                 //      *p = '?';
1784                 //}
1785
1786                 return g_strndup ((char *) decoded, declen);
1787         }
1788
1789         if (!charset[0] || (cd = g_mime_iconv_open ("UTF-8", charset)) == (iconv_t) -1) {
1790                 w(g_warning ("Cannot convert from %s to UTF-8, header display may "
1791                              "be corrupt: %s", charset[0] ? charset : "unspecified charset",
1792                              g_strerror (errno)));
1793
1794                 return g_mime_utils_decode_8bit ((char *) decoded, declen);
1795         }
1796
1797         len = declen;
1798         buf = g_malloc (len + 1);
1799
1800         charset_convert (cd, (char *) decoded, declen, &buf, &len, &ninval);
1801
1802         g_mime_iconv_close (cd);
1803
1804 #if w(!)0
1805         if (ninval > 0) {
1806                 g_warning ("Failed to completely convert \"%.*s\" to UTF-8, display may be "
1807                            "corrupt: %s", declen, decoded, g_strerror (errno));
1808         }
1809 #endif
1810
1811         return buf;
1812 }
1813
1814
1815 /**
1816  * g_mime_utils_header_decode_text:
1817  * @text: header text to decode
1818  *
1819  * Decodes an rfc2047 encoded 'text' header.
1820  *
1821  * Note: See g_mime_set_user_charsets() for details on how charset
1822  * conversion is handled for unencoded 8bit text and/or wrongly
1823  * specified rfc2047 encoded-word tokens.
1824  *
1825  * Returns: a newly allocated UTF-8 string representing the the decoded
1826  * header.
1827  **/
1828 char *
1829 g_mime_utils_header_decode_text (const char *text)
1830 {
1831         gboolean enable_rfc2047_workarounds = _g_mime_enable_rfc2047_workarounds ();
1832         register const char *inptr = text;
1833         gboolean encoded = FALSE;
1834         const char *lwsp, *word;
1835         size_t nlwsp, n;
1836         gboolean ascii;
1837         char *decoded;
1838         GString *out;
1839
1840         if (text == NULL)
1841                 return g_strdup ("");
1842
1843         out = g_string_sized_new (strlen (text) + 1);
1844
1845         while (*inptr != '\0') {
1846                 lwsp = inptr;
1847                 while (is_lwsp (*inptr))
1848                         inptr++;
1849
1850                 nlwsp = (size_t) (inptr - lwsp);
1851
1852                 if (*inptr != '\0') {
1853                         word = inptr;
1854                         ascii = TRUE;
1855
1856                         if (enable_rfc2047_workarounds) {
1857                                 if (!strncmp (inptr, "=?", 2)) {
1858                                         inptr += 2;
1859
1860                                         /* skip past the charset (if one is even declared, sigh) */
1861                                         while (*inptr && *inptr != '?') {
1862                                                 ascii = ascii && is_ascii (*inptr);
1863                                                 inptr++;
1864                                         }
1865
1866                                         /* sanity check encoding type */
1867                                         if (inptr[0] != '?' || !strchr ("BbQq", inptr[1]) || inptr[2] != '?')
1868                                                 goto non_rfc2047;
1869
1870                                         inptr += 3;
1871
1872                                         /* find the end of the rfc2047 encoded word token */
1873                                         while (*inptr && strncmp (inptr, "?=", 2) != 0) {
1874                                                 ascii = ascii && is_ascii (*inptr);
1875                                                 inptr++;
1876                                         }
1877
1878                                         if (!strncmp (inptr, "?=", 2))
1879                                                 inptr += 2;
1880                                 } else {
1881                                 non_rfc2047:
1882                                         /* stop if we encounter a possible rfc2047 encoded
1883                                          * token even if it's inside another word, sigh. */
1884                                         while (*inptr && !is_lwsp (*inptr) &&
1885                                                strncmp (inptr, "=?", 2) != 0) {
1886                                                 ascii = ascii && is_ascii (*inptr);
1887                                                 inptr++;
1888                                         }
1889                                 }
1890                         } else {
1891                                 while (*inptr && !is_lwsp (*inptr)) {
1892                                         ascii = ascii && is_ascii (*inptr);
1893                                         inptr++;
1894                                 }
1895                         }
1896
1897                         n = (size_t) (inptr - word);
1898                         if (is_rfc2047_encoded_word (word, n)) {
1899                                 if ((decoded = rfc2047_decode_word (word, n))) {
1900                                         /* rfc2047 states that you must ignore all
1901                                          * whitespace between encoded words */
1902                                         if (!encoded)
1903                                                 g_string_append_len (out, lwsp, nlwsp);
1904
1905                                         g_string_append (out, decoded);
1906                                         g_free (decoded);
1907
1908                                         encoded = TRUE;
1909                                 } else {
1910                                         /* append lwsp and invalid rfc2047 encoded-word token */
1911                                         g_string_append_len (out, lwsp, nlwsp + n);
1912                                         encoded = FALSE;
1913                                 }
1914                         } else {
1915                                 /* append lwsp */
1916                                 g_string_append_len (out, lwsp, nlwsp);
1917
1918                                 /* append word token */
1919                                 if (!ascii) {
1920                                         /* *sigh* I hate broken mailers... */
1921                                         decoded = g_mime_utils_decode_8bit (word, n);
1922                                         g_string_append (out, decoded);
1923                                         g_free (decoded);
1924                                 } else {
1925                                         g_string_append_len (out, word, n);
1926                                 }
1927
1928                                 encoded = FALSE;
1929                         }
1930                 } else {
1931                         /* appending trailing lwsp */
1932                         g_string_append_len (out, lwsp, nlwsp);
1933                         break;
1934                 }
1935         }
1936
1937         decoded = out->str;
1938         g_string_free (out, FALSE);
1939
1940         return decoded;
1941 }
1942
1943
1944 /**
1945  * g_mime_utils_header_decode_phrase:
1946  * @phrase: header to decode
1947  *
1948  * Decodes an rfc2047 encoded 'phrase' header.
1949  *
1950  * Note: See g_mime_set_user_charsets() for details on how charset
1951  * conversion is handled for unencoded 8bit text and/or wrongly
1952  * specified rfc2047 encoded-word tokens.
1953  *
1954  * Returns: a newly allocated UTF-8 string representing the the decoded
1955  * header.
1956  **/
1957 char *
1958 g_mime_utils_header_decode_phrase (const char *phrase)
1959 {
1960         register const char *inptr = phrase;
1961         gboolean encoded = FALSE;
1962         const char *lwsp, *text;
1963         size_t nlwsp, n;
1964         gboolean ascii;
1965         char *decoded;
1966         GString *out;
1967
1968         if (phrase == NULL)
1969                 return g_strdup ("");
1970
1971         out = g_string_sized_new (strlen (phrase) + 1);
1972
1973         while (*inptr != '\0') {
1974                 lwsp = inptr;
1975                 while (is_lwsp (*inptr))
1976                         inptr++;
1977
1978                 nlwsp = (size_t) (inptr - lwsp);
1979
1980                 text = inptr;
1981                 if (is_atom (*inptr)) {
1982                         while (is_atom (*inptr))
1983                                 inptr++;
1984
1985                         n = (size_t) (inptr - text);
1986                         if (is_rfc2047_encoded_word (text, n)) {
1987                                 if ((decoded = rfc2047_decode_word (text, n))) {
1988                                         /* rfc2047 states that you must ignore all
1989                                          * whitespace between encoded words */
1990                                         if (!encoded)
1991                                                 g_string_append_len (out, lwsp, nlwsp);
1992
1993                                         g_string_append (out, decoded);
1994                                         g_free (decoded);
1995
1996                                         encoded = TRUE;
1997                                 } else {
1998                                         /* append lwsp and invalid rfc2047 encoded-word token */
1999                                         g_string_append_len (out, lwsp, nlwsp + n);
2000                                         encoded = FALSE;
2001                                 }
2002                         } else {
2003                                 /* append lwsp and atom token */
2004                                 g_string_append_len (out, lwsp, nlwsp + n);
2005                                 encoded = FALSE;
2006                         }
2007                 } else {
2008                         g_string_append_len (out, lwsp, nlwsp);
2009
2010                         ascii = TRUE;
2011                         while (*inptr && !is_lwsp (*inptr)) {
2012                                 ascii = ascii && is_ascii (*inptr);
2013                                 inptr++;
2014                         }
2015
2016                         n = (size_t) (inptr - text);
2017
2018                         if (!ascii) {
2019                                 /* *sigh* I hate broken mailers... */
2020                                 decoded = g_mime_utils_decode_8bit (text, n);
2021                                 g_string_append (out, decoded);
2022                                 g_free (decoded);
2023                         } else {
2024                                 g_string_append_len (out, text, n);
2025                         }
2026
2027                         encoded = FALSE;
2028                 }
2029         }
2030
2031         decoded = out->str;
2032         g_string_free (out, FALSE);
2033
2034         return decoded;
2035 }
2036
2037
2038 /* rfc2047 version of quoted-printable */
2039 static size_t
2040 quoted_encode (const char *in, size_t len, unsigned char *out, gushort safemask)
2041 {
2042         register const unsigned char *inptr = (const unsigned char *) in;
2043         const unsigned char *inend = inptr + len;
2044         register unsigned char *outptr = out;
2045         unsigned char c;
2046
2047         while (inptr < inend) {
2048                 c = *inptr++;
2049                 if (c == ' ') {
2050                         *outptr++ = '_';
2051                 } else if (c != '_' && gmime_special_table[c] & safemask) {
2052                         *outptr++ = c;
2053                 } else {
2054                         *outptr++ = '=';
2055                         *outptr++ = tohex[(c >> 4) & 0xf];
2056                         *outptr++ = tohex[c & 0xf];
2057                 }
2058         }
2059
2060         return (outptr - out);
2061 }
2062
2063 static void
2064 rfc2047_encode_word (GString *string, const char *word, size_t len,
2065                      const char *charset, gushort safemask)
2066 {
2067         register char *inptr, *outptr;
2068         iconv_t cd = (iconv_t) -1;
2069         unsigned char *encoded;
2070         size_t enclen, pos;
2071         char *uword = NULL;
2072         guint32 save = 0;
2073         int state = 0;
2074         char encoding;
2075
2076         if (g_ascii_strcasecmp (charset, "UTF-8") != 0)
2077                 cd = g_mime_iconv_open (charset, "UTF-8");
2078
2079         if (cd != (iconv_t) -1) {
2080                 uword = g_mime_iconv_strndup (cd, (char *) word, len);
2081                 g_mime_iconv_close (cd);
2082         }
2083
2084         if (uword) {
2085                 len = strlen (uword);
2086                 word = uword;
2087         } else {
2088                 charset = "UTF-8";
2089         }
2090
2091         switch (g_mime_utils_best_encoding ((const unsigned char *) word, len)) {
2092         case GMIME_CONTENT_ENCODING_BASE64:
2093                 enclen = GMIME_BASE64_ENCODE_LEN (len);
2094                 encoded = g_alloca (enclen + 1);
2095
2096                 encoding = 'b';
2097
2098                 pos = g_mime_encoding_base64_encode_close ((const unsigned char *) word, len, encoded, &state, &save);
2099                 encoded[pos] = '\0';
2100
2101                 /* remove \n chars as headers need to be wrapped differently */
2102                 if (G_UNLIKELY ((inptr = strchr ((char *) encoded, '\n')))) {
2103                         outptr = inptr++;
2104                         while (G_LIKELY (*inptr)) {
2105                                 if (G_LIKELY (*inptr != '\n'))
2106                                         *outptr++ = *inptr;
2107
2108                                 inptr++;
2109                         }
2110
2111                         *outptr = '\0';
2112                 }
2113
2114                 break;
2115         case GMIME_CONTENT_ENCODING_QUOTEDPRINTABLE:
2116                 enclen = GMIME_QP_ENCODE_LEN (len);
2117                 encoded = g_alloca (enclen + 1);
2118
2119                 encoding = 'q';
2120
2121                 pos = quoted_encode (word, len, encoded, safemask);
2122                 encoded[pos] = '\0';
2123
2124                 break;
2125         default:
2126                 encoded = NULL;
2127                 encoding = '\0';
2128                 g_assert_not_reached ();
2129         }
2130
2131         g_free (uword);
2132
2133         g_string_append_printf (string, "=?%s?%c?%s?=", charset, encoding, encoded);
2134 }
2135
2136
/* Kind of token a run of header text was classified as.
 *
 * The numeric order matters: callers combine two kinds with MAX() so
 * that the result is the more demanding of the two. */
typedef enum {
        WORD_ATOM    = 0,   /* can be written out as-is */
        WORD_QSTRING = 1,   /* must be written as a quoted-string */
        WORD_2047    = 2    /* must be written as an rfc2047 encoded-word */
} rfc822_word_t;

typedef struct _rfc822_word rfc822_word;

/* One token of the header being encoded; tokens form a singly linked
 * list and reference the input string rather than owning a copy. */
struct _rfc822_word {
        rfc822_word *next;     /* next token in the list (kept as first member) */
        const char *start;     /* first byte of the token */
        const char *end;       /* one past the last byte of the token */
        rfc822_word_t type;    /* how the token has to be written out */
        int encoding;          /* 0 = us-ascii, 1 = iso-8859-1, 2 = needs another charset */
};

/* tokens are allocated from, and returned to, the GLib slice allocator */
#define rfc822_word_free(word) g_slice_free (rfc822_word, word)
#define rfc822_word_new() g_slice_new (rfc822_word)
2152
2153 /* okay, so 'unstructured text' fields don't actually contain 'word'
2154  * tokens, but we can group stuff similarly... */
2155 static rfc822_word *
2156 rfc2047_encode_get_rfc822_words (const char *in, gboolean phrase)
2157 {
2158         rfc822_word *words, *tail, *word;
2159         rfc822_word_t type = WORD_ATOM;
2160         const char *inptr, *start, *last;
2161         int count = 0, encoding = 0;
2162
2163         words = NULL;
2164         tail = (rfc822_word *) &words;
2165
2166         last = start = inptr = in;
2167         while (inptr && *inptr) {
2168                 const char *newinptr;
2169                 gunichar c;
2170
2171                 newinptr = g_utf8_next_char (inptr);
2172                 c = g_utf8_get_char (inptr);
2173                 if (newinptr == NULL || !g_unichar_validate (c)) {
2174                         w(g_warning ("Invalid UTF-8 sequence encountered"));
2175                         inptr++;
2176                         continue;
2177                 }
2178
2179                 inptr = newinptr;
2180
2181                 if (c < 256 && is_lwsp (c)) {
2182                         if (count > 0) {
2183                                 word = rfc822_word_new ();
2184                                 word->next = NULL;
2185                                 word->start = start;
2186                                 word->end = last;
2187                                 word->type = type;
2188                                 word->encoding = encoding;
2189
2190                                 tail->next = word;
2191                                 tail = word;
2192                                 count = 0;
2193                         }
2194
2195                         start = inptr;
2196                         type = WORD_ATOM;
2197                         encoding = 0;
2198                 } else {
2199                         count++;
2200                         if (phrase && c < 128) {
2201                                 /* phrases can have qstring words */
2202                                 if (!is_atom (c))
2203                                         type = MAX (type, WORD_QSTRING);
2204                         } else if (c > 127 && c < 256) {
2205                                 type = WORD_2047;
2206                                 encoding = MAX (encoding, 1);
2207                         } else if (c >= 256) {
2208                                 type = WORD_2047;
2209                                 encoding = 2;
2210                         }
2211
2212                         if (count >= GMIME_FOLD_PREENCODED) {
2213                                 word = rfc822_word_new ();
2214                                 word->next = NULL;
2215                                 word->start = start;
2216                                 word->end = inptr;
2217                                 word->type = type;
2218                                 word->encoding = encoding;
2219
2220                                 tail->next = word;
2221                                 tail = word;
2222                                 count = 0;
2223
2224                                 /* Note: don't reset 'type' as it
2225                                  * needs to be preserved when breaking
2226                                  * long words */
2227                                 start = inptr;
2228                                 encoding = 0;
2229                         }
2230                 }
2231
2232                 last = inptr;
2233         }
2234
2235         if (count > 0) {
2236                 word = rfc822_word_new ();
2237                 word->next = NULL;
2238                 word->start = start;
2239                 word->end = last;
2240                 word->type = type;
2241                 word->encoding = encoding;
2242
2243                 tail->next = word;
2244                 tail = word;
2245         }
2246
2247 #if d(!)0
2248         printf ("rfc822 word tokens:\n");
2249         word = words;
2250         while (word) {
2251                 printf ("\t'%.*s'; type=%d, encoding=%d\n",
2252                         word->end - word->start, word->start,
2253                         word->type, word->encoding);
2254
2255                 word = word->next;
2256         }
2257 #endif
2258
2259         return words;
2260 }
2261
2262 #define MERGED_WORD_LT_FOLDLEN(wlen, type) ((type) == WORD_2047 ? (wlen) < GMIME_FOLD_PREENCODED : (wlen) < (GMIME_FOLD_LEN - 8))
2263
2264 static gboolean
2265 should_merge_words (rfc822_word *word, rfc822_word *next)
2266 {
2267         switch (word->type) {
2268         case WORD_ATOM:
2269                 if (next->type == WORD_2047)
2270                         return FALSE;
2271
2272                 return (MERGED_WORD_LT_FOLDLEN (next->end - word->start, next->type));
2273         case WORD_QSTRING:
2274                 /* avoid merging with words that need to be rfc2047 encoded */
2275                 if (next->type == WORD_2047)
2276                         return FALSE;
2277
2278                 return (MERGED_WORD_LT_FOLDLEN (next->end - word->start, WORD_QSTRING));
2279         case WORD_2047:
2280                 if (next->type == WORD_ATOM) {
2281                         /* whether we merge or not is dependent upon:
2282                          * 1. the number of atoms in a row after 'word'
2283                          * 2. if there is another encword after the string of atoms.
2284                          */
2285                         int natoms = 0;
2286
2287                         while (next && next->type == WORD_ATOM) {
2288                                 next = next->next;
2289                                 natoms++;
2290                         }
2291
2292                         /* if all the words after the encword are atoms, don't merge */
2293                         if (!next || natoms > 3)
2294                                 return FALSE;
2295                 }
2296
2297                 /* avoid merging with qstrings */
2298                 if (next->type == WORD_QSTRING)
2299                         return FALSE;
2300
2301                 return (MERGED_WORD_LT_FOLDLEN (next->end - word->start, WORD_2047));
2302         default:
2303                 return FALSE;
2304         }
2305 }
2306
2307 static void
2308 rfc2047_encode_merge_rfc822_words (rfc822_word **wordsp)
2309 {
2310         rfc822_word *word, *next, *words = *wordsp;
2311
2312         /* first pass: merge qstrings with adjacent qstrings and encwords with adjacent encwords */
2313         word = words;
2314         while (word && word->next) {
2315                 next = word->next;
2316
2317                 if (word->type != WORD_ATOM && word->type == next->type &&
2318                     MERGED_WORD_LT_FOLDLEN (next->end - word->start, word->type)) {
2319                         /* merge the words */
2320                         word->encoding = MAX (word->encoding, next->encoding);
2321
2322                         word->end = next->end;
2323                         word->next = next->next;
2324
2325                         rfc822_word_free (next);
2326
2327                         next = word;
2328                 }
2329
2330                 word = next;
2331         }
2332
2333         /* second pass: now merge atoms with the other words */
2334         word = words;
2335         while (word && word->next) {
2336                 next = word->next;
2337
2338                 if (should_merge_words (word, next)) {
2339                         /* the resulting word type is the MAX of the 2 types */
2340                         word->type = MAX (word->type, next->type);
2341
2342                         word->encoding = MAX (word->encoding, next->encoding);
2343
2344                         word->end = next->end;
2345                         word->next = next->next;
2346
2347                         rfc822_word_free (next);
2348
2349                         continue;
2350                 }
2351
2352                 word = next;
2353         }
2354
2355         *wordsp = words;
2356 }
2357
2358 static void
2359 g_string_append_len_quoted (GString *out, const char *in, size_t len)
2360 {
2361         register const char *inptr;
2362         const char *inend;
2363
2364         g_string_append_c (out, '"');
2365
2366         inptr = in;
2367         inend = in + len;
2368
2369         while (inptr < inend) {
2370                 if (*inptr == '"' || *inptr == '\\')
2371                         g_string_append_c (out, '\\');
2372
2373                 g_string_append_c (out, *inptr);
2374
2375                 inptr++;
2376         }
2377
2378         g_string_append_c (out, '"');
2379 }
2380
2381 static char *
2382 rfc2047_encode (const char *in, gushort safemask)
2383 {
2384         rfc822_word *words, *word, *prev = NULL;
2385         const char **charsets, *charset;
2386         const char *start;
2387         GMimeCharset mask;
2388         GString *out;
2389         char *outstr;
2390         size_t len;
2391         int i;
2392
2393         if (!(words = rfc2047_encode_get_rfc822_words (in, safemask & IS_PSAFE)))
2394                 return g_strdup (in);
2395
2396         rfc2047_encode_merge_rfc822_words (&words);
2397
2398         charsets = g_mime_user_charsets ();
2399
2400         out = g_string_new ("");
2401
2402         /* output words now with spaces between them */
2403         word = words;
2404         while (word) {
2405                 /* append correct number of spaces between words */
2406                 if (prev && !(prev->type == WORD_2047 && word->type == WORD_2047)) {
2407                         /* one or both of the words are not encoded so we write the spaces out untouched */
2408                         len = word->start - prev->end;
2409                         g_string_append_len (out, prev->end, len);
2410                 }
2411
2412                 switch (word->type) {
2413                 case WORD_ATOM:
2414                         g_string_append_len (out, word->start, (size_t) (word->end - word->start));
2415                         break;
2416                 case WORD_QSTRING:
2417                         g_assert (safemask & IS_PSAFE);
2418                         g_string_append_len_quoted (out, word->start, (size_t) (word->end - word->start));
2419                         break;
2420                 case WORD_2047:
2421                         if (prev && prev->type == WORD_2047) {
2422                                 /* include the whitespace chars between these 2 words in the
2423                                    resulting rfc2047 encoded word. */
2424                                 len = word->end - prev->end;
2425                                 start = prev->end;
2426
2427                                 /* encoded words need to be separated by linear whitespace */
2428                                 g_string_append_c (out, ' ');
2429                         } else {
2430                                 len = word->end - word->start;
2431                                 start = word->start;
2432                         }
2433
2434                         switch (word->encoding) {
2435                         case 0: /* us-ascii */
2436                                 rfc2047_encode_word (out, start, len, "us-ascii", safemask);
2437                                 break;
2438                         case 1: /* iso-8859-1 */
2439                                 rfc2047_encode_word (out, start, len, "iso-8859-1", safemask);
2440                                 break;
2441                         default:
2442                                 charset = NULL;
2443                                 g_mime_charset_init (&mask);
2444                                 g_mime_charset_step (&mask, start, len);
2445
2446                                 for (i = 0; charsets && charsets[i]; i++) {
2447                                         if (g_mime_charset_can_encode (&mask, charsets[i], start, len)) {
2448                                                 charset = charsets[i];
2449                                                 break;
2450                                         }
2451                                 }
2452
2453                                 if (!charset)
2454                                         charset = g_mime_charset_best_name (&mask);
2455
2456                                 rfc2047_encode_word (out, start, len, charset, safemask);
2457                                 break;
2458                         }
2459
2460                         break;
2461                 }
2462
2463                 rfc822_word_free (prev);
2464
2465                 prev = word;
2466                 word = word->next;
2467         }
2468
2469         rfc822_word_free (prev);
2470
2471         outstr = out->str;
2472         g_string_free (out, FALSE);
2473
2474         return outstr;
2475 }
2476
2477
2478 /**
2479  * g_mime_utils_header_encode_phrase:
2480  * @phrase: phrase to encode
2481  *
2482  * Encodes a 'phrase' header according to the rules in rfc2047.
2483  *
2484  * Returns: the encoded 'phrase'. Useful for encoding internet
2485  * addresses.
2486  **/
2487 char *
2488 g_mime_utils_header_encode_phrase (const char *phrase)
2489 {
2490         if (phrase == NULL)
2491                 return NULL;
2492
2493         return rfc2047_encode (phrase, IS_PSAFE);
2494 }
2495
2496
2497 /**
2498  * g_mime_utils_header_encode_text:
2499  * @text: text to encode
2500  *
2501  * Encodes a 'text' header according to the rules in rfc2047.
2502  *
2503  * Returns: the encoded header. Useful for encoding
2504  * headers like "Subject".
2505  **/
2506 char *
2507 g_mime_utils_header_encode_text (const char *text)
2508 {
2509         if (text == NULL)
2510                 return NULL;
2511
2512         return rfc2047_encode (text, IS_ESAFE);
2513 }