From: William Morgan Date: Mon, 18 May 2009 14:10:27 +0000 (-0700) Subject: Merge branch 'various-mbox-fixes' into next X-Git-Url: https://git.notmuchmail.org/git?a=commitdiff_plain;h=47133afaad9c0a0eb34cf1b4fb77a251388a4359;p=sup Merge branch 'various-mbox-fixes' into next Conflicts: lib/sup/mbox.rb test/test_mbox_parsing.rb --- 47133afaad9c0a0eb34cf1b4fb77a251388a4359 diff --cc lib/sup/mbox.rb index 53b4e8c,9bd10ad..0d941b1 --- a/lib/sup/mbox.rb +++ b/lib/sup/mbox.rb @@@ -5,7 -5,91 +5,21 @@@ require "sup/rfc2047 module Redwood -## some utility functions. actually these are not mbox-specific at all -## and should be moved somewhere else. -## -## TODO: move functionality to somewhere better, like message.rb module MBox - BREAK_RE = /^From \S+@\S+ / + BREAK_RE = /^From \S+ (.+)$/ - HEADER_RE = /\s*(.*?)\s*/ + + def is_break_line? l + l =~ BREAK_RE or return false + time = $1 + begin + ## hack -- make Time.parse fail when trying to substitute values from Time.now + Time.parse time, 0 + true + rescue NoMethodError + Redwood::log "found invalid date in potential mbox split line, not splitting: #{l.inspect}" + false + end + end + module_function :is_break_line? - - def read_header f - header = {} - last = nil - - ## i do it in this weird way because i am trying to speed things up - ## when scanning over large mbox files. - while(line = f.gets) - case line - ## these three can occur multiple times, and we want the first one - when /^(Delivered-To):#{HEADER_RE}$/i, - /^(X-Original-To):#{HEADER_RE}$/i, - /^(Envelope-To):#{HEADER_RE}$/i: header[last = $1] ||= $2 - - when /^(From):#{HEADER_RE}$/i, - /^(To):#{HEADER_RE}$/i, - /^(Cc):#{HEADER_RE}$/i, - /^(Bcc):#{HEADER_RE}$/i, - /^(Subject):#{HEADER_RE}$/i, - /^(Date):#{HEADER_RE}$/i, - /^(References):#{HEADER_RE}$/i, - /^(In-Reply-To):#{HEADER_RE}$/i, - /^(Reply-To):#{HEADER_RE}$/i, - /^(List-Post):#{HEADER_RE}$/i, - /^(List-Subscribe):#{HEADER_RE}$/i, - /^(List-Unsubscribe):#{HEADER_RE}$/i, - /^(Status):#{HEADER_RE}$/i, - /^(X-\S+):#{HEADER_RE}$/: header[last = $1] = $2 - when /^(Message-Id):#{HEADER_RE}$/i: header[mid_field = last = $1] = $2 - - when /^\r*$/: break - when /^\S+:/: last = nil # some other header we don't care about - else - header[last] += " " + line.chomp.gsub(/^\s+/, "") if last - end - end - - if mid_field && header[mid_field] && header[mid_field] =~ /<(.*?)>/ - header[mid_field] = $1 - end - - header.each do |k, v| - next unless Rfc2047.is_encoded? v - header[k] = - begin - Rfc2047.decode_to $encoding, v - rescue Errno::EINVAL, Iconv::InvalidEncoding, Iconv::IllegalSequence => e - Redwood::log "warning: error decoding RFC 2047 header (#{e.class.name}): #{e.message}" - v - end - end - header - end - - ## never actually called - def read_body f - body = [] - f.each_line do |l| - break if is_break_line?(l) - body << l.chomp - end - body - end - - module_function :read_header, :read_body end end diff --cc lib/sup/mbox/loader.rb index c35d0c8,57d983d..f499827 --- a/lib/sup/mbox/loader.rb +++ b/lib/sup/mbox/loader.rb @@@ -56,10 -56,10 +56,10 @@@ class Loader < Sourc @mutex.synchronize do @f.seek offset l = @f.gets - unless l =~ BREAK_RE + unless MBox::is_break_line? l raise OutOfSyncSourceError, "mismatch in mbox file offset #{offset.inspect}: #{l.inspect}." end - header = MBox::read_header @f + header = parse_raw_email_header @f end header end @@@ -80,18 -80,6 +80,18 @@@ end end + ## scan forward until we're at the valid start of a message + def correct_offset! + @mutex.synchronize do + @f.seek cur_offset + string = "" - until @f.eof? || (l = @f.gets) =~ BREAK_RE ++ until @f.eof? || MBox::is_break_line?(l = @f.gets) + string << l + end + self.cur_offset += string.length + end + end + def raw_header offset ret = "" @mutex.synchronize do diff --cc test/test_header_parsing.rb index 7368d81,0000000..91cf7c7 mode 100644,000000..100644 --- a/test/test_header_parsing.rb +++ b/test/test_header_parsing.rb @@@ -1,107 -1,0 +1,157 @@@ +#!/usr/bin/ruby + +require 'test/unit' +require 'sup' +require 'stringio' + +include Redwood + +class TestMBoxParsing < Test::Unit::TestCase + def setup + end + + def teardown + end + + def test_normal_headers + h = Source.parse_raw_email_header StringIO.new(< +To: Sally +EOS + + assert_equal "Bob ", h["from"] + assert_equal "Sally ", h["to"] + assert_nil h["message-id"] + end + + def test_multiline + h = Source.parse_raw_email_header StringIO.new(< +Subject: one two three + four five six +To: Sally +References: + +Seven: Eight +EOS + + assert_equal "one two three four five six", h["subject"] + assert_equal "Sally ", h["to"] + assert_equal " ", h["references"] + end + + def test_ignore_spacing + variants = [ + "Subject:one two three end\n", + "Subject: one two three end\n", + "Subject: one two three end \n", + ] + variants.each do |s| + h = Source.parse_raw_email_header StringIO.new(s) + assert_equal "one two three end", h["subject"] + end + end + + def test_message_id_ignore_spacing + variants = [ + "Message-Id: \n", + "Message-Id: \n", + ] + variants.each do |s| + h = Source.parse_raw_email_header StringIO.new(s) + assert_equal "", h["message-id"] + end + end + + def test_blank_lines + h = Source.parse_raw_email_header StringIO.new("") + assert_equal nil, h["message-id"] + end + + def test_empty_headers + variants = [ + "Message-Id: \n", + "Message-Id:\n", + ] + variants.each do |s| + h = Source.parse_raw_email_header StringIO.new(s) + assert_equal "", h["message-id"] + end + end + + def test_detect_end_of_headers + h = Source.parse_raw_email_header StringIO.new(< + +To: a dear friend +EOS + assert_equal "Bob ", h["from"] + assert_nil h["to"] + + h = Source.parse_raw_email_header StringIO.new(< +\r +To: a dear friend +EOS + assert_equal "Bob ", h["from"] + assert_nil h["to"] + + h = Source.parse_raw_email_header StringIO.new(< +\r\n\r +To: a dear friend +EOS + assert_equal "Bob ", h["from"] + assert_nil h["to"] + end ++ ++ def test_from_line_splitting ++ l = MBox::Loader.new StringIO.new(< ++To: a dear friend ++ ++Hello there friend. How are you? ++ ++From sea to shining sea ++ ++From bob@bob.com I get only spam. ++ ++From bob@bob.com ++ ++From bob@bob.com ++ ++(that second one has spaces at the endj ++ ++This is the end of the email. ++EOS ++ offset, labels = l.next ++ assert_equal 0, offset ++ offset, labels = l.next ++ assert_nil offset ++ end ++ ++ def test_more_from_line_splitting ++ l = MBox::Loader.new StringIO.new(< ++To: a dear friend ++ ++Hello there friend. How are you? ++ ++From bob@bob.com Mon Apr 27 12:56:19 2009 ++From: Bob ++To: a dear friend ++ ++Hello again! Would you like to buy my products? ++EOS ++ offset, labels = l.next ++ assert_not_nil offset ++ ++ offset, labels = l.next ++ assert_not_nil offset ++ ++ offset, labels = l.next ++ assert_nil offset ++ end +end