From: William Morgan Date: Sun, 17 May 2009 18:37:43 +0000 (-0700) Subject: check for a correct date on mbox From lines X-Git-Url: https://git.notmuchmail.org/git?a=commitdiff_plain;h=67f4b1d32fa1dce2704c7bc21eb461875356962c;p=sup check for a correct date on mbox From lines Determine a splitting line by looking for a From, a something else, and a parseable date. --- diff --git a/lib/sup/mbox.rb b/lib/sup/mbox.rb index 33a8adb..9bd10ad 100644 --- a/lib/sup/mbox.rb +++ b/lib/sup/mbox.rb @@ -10,9 +10,23 @@ module Redwood ## ## TODO: move functionality to somewhere better, like message.rb module MBox - BREAK_RE = /^From \S+@\S+ / + BREAK_RE = /^From \S+ (.+)$/ HEADER_RE = /\s*(.*?)\s*/ + def is_break_line? l + l =~ BREAK_RE or return false + time = $1 + begin + ## hack -- make Time.parse fail when trying to substitute values from Time.now + Time.parse time, 0 + true + rescue NoMethodError + Redwood::log "found invalid date in potential mbox split line, not splitting: #{l.inspect}" + false + end + end + module_function :is_break_line? + def read_header f header = {} last = nil @@ -70,7 +84,7 @@ module MBox def read_body f body = [] f.each_line do |l| - break if l =~ BREAK_RE + break if is_break_line?(l) body << l.chomp end body diff --git a/lib/sup/mbox/loader.rb b/lib/sup/mbox/loader.rb index c500f36..57d983d 100644 --- a/lib/sup/mbox/loader.rb +++ b/lib/sup/mbox/loader.rb @@ -56,7 +56,7 @@ class Loader < Source @mutex.synchronize do @f.seek offset l = @f.gets - unless l =~ BREAK_RE + unless MBox::is_break_line? l raise OutOfSyncSourceError, "mismatch in mbox file offset #{offset.inspect}: #{l.inspect}." end header = MBox::read_header @f @@ -72,7 +72,7 @@ class Loader < Source ## "From" at the start of a message body line. string = "" l = @f.gets - string << l until @f.eof? || (l = @f.gets) =~ BREAK_RE + string << l until @f.eof? || MBox::is_break_line?(l = @f.gets) RMail::Parser.read string rescue RMail::Parser::Error => e raise FatalSourceError, "error parsing mbox file: #{e.message}" @@ -107,7 +107,7 @@ class Loader < Source @mutex.synchronize do @f.seek offset yield @f.gets - until @f.eof? || (l = @f.gets) =~ BREAK_RE + until @f.eof? || MBox::is_break_line?(l = @f.gets) yield l end end @@ -138,7 +138,7 @@ class Loader < Source end while(line = @f.gets) - break if line =~ BREAK_RE + break if MBox::is_break_line? line next_offset = @f.tell end end diff --git a/test/test_mbox_parsing.rb b/test/test_mbox_parsing.rb index 32687e5..dbff2e2 100644 --- a/test/test_mbox_parsing.rb +++ b/test/test_mbox_parsing.rb @@ -115,4 +115,54 @@ EOS assert_equal "Bob ", h["From"] assert_nil h["To"] end + + def test_from_line_splitting + l = MBox::Loader.new StringIO.new(< +To: a dear friend + +Hello there friend. How are you? + +From sea to shining sea + +From bob@bob.com I get only spam. + +From bob@bob.com + +From bob@bob.com + +(that second one has spaces at the endj + +This is the end of the email. +EOS + offset, labels = l.next + assert_equal 0, offset + offset, labels = l.next + assert_nil offset + end + + def test_more_from_line_splitting + l = MBox::Loader.new StringIO.new(< +To: a dear friend + +Hello there friend. How are you? + +From bob@bob.com Mon Apr 27 12:56:19 2009 +From: Bob +To: a dear friend + +Hello again! Would you like to buy my products? +EOS + offset, labels = l.next + assert_not_nil offset + + offset, labels = l.next + assert_not_nil offset + + offset, labels = l.next + assert_nil offset + end end