From 9d19facb64020cb299af59c8d8e552c90a0021d1 Mon Sep 17 00:00:00 2001 From: wmorgan Date: Sun, 20 May 2007 21:07:33 +0000 Subject: [PATCH] refactor threading (much nicer now). thread by subject is now configurable and defaults to off, which is much faster. git-svn-id: svn://rubyforge.org/var/svn/sup/trunk@404 5c8cc53c-5e98-4d25-b20a-d8db53a31250 --- lib/sup.rb | 1 + lib/sup/index.rb | 12 +- lib/sup/modes/thread-index-mode.rb | 2 +- lib/sup/thread.rb | 218 +++++++++++++---------------- 4 files changed, 104 insertions(+), 129 deletions(-) diff --git a/lib/sup.rb b/lib/sup.rb index fd898ae..11567f7 100644 --- a/lib/sup.rb +++ b/lib/sup.rb @@ -160,6 +160,7 @@ else } }, :editor => ENV["EDITOR"] || "/usr/bin/vi", + :thread_by_subject => false, } begin FileUtils.mkdir_p Redwood::BASE_DIR diff --git a/lib/sup/index.rb b/lib/sup/index.rb index 19d4ed6..cd7c775 100644 --- a/lib/sup/index.rb +++ b/lib/sup/index.rb @@ -148,20 +148,19 @@ class Index end ## yield all messages in the thread containing 'm' by repeatedly - ## querying the index. uields pairs of message ids and + ## querying the index. yields pairs of message ids and ## message-building lambdas, so that building an unwanted message ## can be skipped in the block if desired. ## ## stops loading any thread if a message with a :killed flag is found. - SAME_SUBJECT_DATE_LIMIT = 7 def each_message_in_thread_for m, opts={} + Redwood::log "Building thread for #{m.id}: #{m.subj}" messages = {} searched = {} num_queries = 0 - ## todo: make subject querying configurable - if true # do subject queries + if $config[:thread_by_subject] # do subject queries date_min = m.date - (SAME_SUBJECT_DATE_LIMIT * 12 * 3600) date_max = m.date + (SAME_SUBJECT_DATE_LIMIT * 12 * 3600) @@ -196,14 +195,15 @@ class Index break if opts[:limit] && messages.size >= opts[:limit] break if @index[docid][:label].split(/\s+/).include? "killed" unless opts[:load_killed] mid = @index[docid][:message_id] - unless messages.member? mid + unless id == mid || messages.member?(mid) + Redwood::log "got #{mid} as a child of #{id}" messages[mid] ||= lambda { build_message docid } refs = @index[docid][:refs].split(" ") pending += refs end end end - Redwood::log "ran #{num_queries} queries to build thread of #{messages.size} messages for #{m.id}" if num_queries > 0 + Redwood::log "ran #{num_queries} queries to build thread of #{messages.size + 1} messages for #{m.id}" if num_queries > 0 messages.each { |mid, builder| yield mid, builder } end diff --git a/lib/sup/modes/thread-index-mode.rb b/lib/sup/modes/thread-index-mode.rb index 089e0e8..b89079b 100644 --- a/lib/sup/modes/thread-index-mode.rb +++ b/lib/sup/modes/thread-index-mode.rb @@ -455,7 +455,7 @@ protected private def initialize_threads - @ts = ThreadSet.new Index.instance + @ts = ThreadSet.new Index.instance, $config[:thread_by_subject] @ts_mutex = Mutex.new @hidden_threads = {} end diff --git a/lib/sup/thread.rb b/lib/sup/thread.rb index 85e503c..88611a1 100644 --- a/lib/sup/thread.rb +++ b/lib/sup/thread.rb @@ -1,24 +1,28 @@ -## Herein is all the code responsible for threading messages. I use an -## online version of the JWZ threading algorithm: +## Herein lies all the code responsible for threading messages. It's +## basically an online version of the JWZ threading algorithm: ## http://www.jwz.org/doc/threading.html ## -## I certainly didn't implement it for efficiency, but thanks to our -## search engine backend, it's typically not applied to very many -## messages at once. - -## At the top level, we have a ThreadSet. A ThreadSet represents a set -## of threads, e.g. a message folder or an inbox. Each ThreadSet -## contains zero or more Threads. A Thread represents all the message -## related to a particular subject. Each Thread has one or more -## Containers. A Container is a recursive structure that holds the -## tree structure as determined by the references: and in-reply-to: -## headers. A Thread with multiple Containers occurs if they have the -## same subject, but (most likely due to someone using a primitive -## MUA) we don't have evidence from in-reply-to: or references: -## headers, only subject: (and thus our tree is probably broken). A -## Container holds zero or one message. In the case of no message, it -## means we've seen a reference to the message but haven't seen the -## message itself (yet). +## I didn't implement it for efficiency, but thanks to our search +## engine backend, it's typically not applied to very many messages at +## once. +## +## At the top level, we have a ThreadSet, which represents a set of +## threads, e.g. a message folder or an inbox. Each ThreadSet contains +## zero or more Threads. A Thread represents all the message related +## to a particular subject. Each Thread has one or more Containers. A +## Container is a recursive structure that holds the message tree as +## determined by the references: and in-reply-to: headers. EAch +## Container holds zero or one messages. In the case of zero messages, +## it means we've seen a reference to the message but haven't (yet) +## seen the message itself. +## +## A Thread can have multiple top-level Containers if we decide to +## group them together independent of tree structure, typically if +## (e.g. due to someone using a primitive MUA) the messages have the +## same subject but we don't have evidence from in-reply-to: or +## references: headers. In this case Thread#each can optionally yield +## a faked root object tying them all together into one tree +## structure. module Redwood @@ -38,20 +42,20 @@ class Thread end def empty?; @containers.empty?; end - - def drop c - raise "bad drop" unless @containers.member? c - @containers.delete c + def empty!; @containers.clear; end + def drop c; @containers.delete(c) or raise "bad drop"; end + + ## unused + def dump f=$stdout + f.puts "=== start thread #{self} with #{@containers.length} trees ===" + @containers.each { |c| c.dump_recursive f } + f.puts "=== end thread ===" end - def dump - puts "=== start thread #{self} with #{@containers.length} trees ===" - @containers.each { |c| c.dump_recursive } - puts "=== end thread ===" - end - - ## yields each message, its depth, and its parent. note that the - ## message can be a Message object, or :fake_root, or nil. + ## yields each message, its depth, and its parent. the message yield + ## parameter can be a Message object, or :fake_root, or nil (no + ## message found but the presence of one induced from other + ## messages). def each fake_root=false adj = 0 root = @containers.find_all { |c| !Message.subj_is_reply?(c) }.argmin { |c| c.date || 0 } @@ -201,11 +205,11 @@ class Container ].compact.join(" ") + ">" end - def dump_recursive indent=0, root=true, parent=nil + def dump_recursive f=$stdout, indent=0, root=true, parent=nil raise "inconsistency" unless parent.nil? || parent.children.include?(self) unless root - print " " * indent - print "+->" + f.print " " * indent + f.print "+->" end line = #"[#{useful? ? 'U' : ' '}] " + if @message @@ -214,22 +218,31 @@ class Container "" end - puts "#{id} #{line}"#[0 .. (105 - indent)] + f.puts "#{id} #{line}"#[0 .. (105 - indent)] indent += 3 - @children.each { |c| c.dump_recursive indent, false, self } + @children.each { |c| c.dump_recursive f, indent, false, self } end end -## a set of threads (so a forest). builds the thread structures by -## reading messages from an index. +## A set of threads (so a forest). Builds thread structures by reading +## messages from an index. +## +## If 'thread_by_subj' is true, puts messages with the same subject in +## one thread, even if they don't reference each other. This is +## helpful for crappy MUAs that don't set In-reply-to: or References: +## headers, but means that messages may be threaded unnecessarily. class ThreadSet attr_reader :num_messages + bool_reader :thread_by_subj - def initialize index + def initialize index, thread_by_subj=true @index = index @num_messages = 0 - @messages = {} ## map from message ids to container objects - @subj_thread = {} ## map from subject strings to thread objects + ## map from message ids to container objects + @messages = SavingHash.new { |id| Container.new id } + ## map from subject strings or (or root message ids) to thread objects + @threads = SavingHash.new { Thread.new } + @thread_by_subj = thread_by_subj end def contains_id? id; @messages.member?(id) && !@messages[id].empty?; end @@ -238,19 +251,20 @@ class ThreadSet end def delete_cruft - @subj_thread.each { |k, v| @subj_thread.delete(k) if v.empty? || v.subj != k } + @threads.each { |k, v| @threads.delete(k) if v.empty? } end private :delete_cruft - def threads; delete_cruft; @subj_thread.values; end - def size; delete_cruft; @subj_thread.size; end + def threads; delete_cruft; @threads.values; end + def size; delete_cruft; @threads.size; end - def dump - @subj_thread.each do |s, t| - puts "**********************" - puts "** for subject #{s} **" - puts "**********************" - t.dump + ## unused + def dump f + @threads.each do |s, t| + f.puts "**********************" + f.puts "** for subject #{s} **" + f.puts "**********************" + t.dump f end end @@ -291,7 +305,7 @@ class ThreadSet m = builder.call add_message m load_thread_for_message m, :load_killed => opts[:load_killed] - yield @subj_thread.size if block_given? + yield @threads.size if block_given? end end @@ -305,7 +319,7 @@ class ThreadSet ## merges in a pre-loaded thread def add_thread t - raise "duplicate" if @subj_thread.values.member? t + raise "duplicate" if @threads.values.member? t t.each { |m, *o| add_message m } end @@ -313,11 +327,10 @@ class ThreadSet m.refs.any? { |ref_id| @messages[ref_id] } end - ## an "online" version of the jwz threading algorithm. + ## the heart of the threading code def add_message message - id = message.id - el = (@messages[id] ||= Container.new id) - return if @messages[id].message # we've seen it before + el = @messages[message.id] + return if el.message # we've seen it before el.message = message oldroot = el.root @@ -325,8 +338,7 @@ class ThreadSet ## link via references: prev = nil message.refs.each do |ref_id| - raise "non-String ref id #{ref_id.inspect} (full: #{message.refs.inspect})" unless ref_id.is_a?(String) - ref = (@messages[ref_id] ||= Container.new ref_id) + ref = @messages[ref_id] link prev, ref if prev prev = ref end @@ -334,81 +346,43 @@ class ThreadSet ## link via in-reply-to: message.replytos.each do |ref_id| - ref = (@messages[ref_id] ||= Container.new ref_id) + ref = @messages[ref_id] link ref, el, true break # only do the first one end - ## update subject grouping root = el.root - # puts "> have #{el}, root #{root}, oldroot #{oldroot}" - # el.dump_recursive - - if root == oldroot - if oldroot.thread - ## check to see if the subject is still the same (in the case - ## that we first added a child message with a different - ## subject) - - ## this code is duplicated below. sorry! TODO: refactor - s = Message.normalize_subj(root.subj) - unless @subj_thread[s] == root.thread - ## Redwood::log "[1] moving thread to new subject #{root.subj}" - if @subj_thread[s] - @subj_thread[s] << root - root.thread = @subj_thread[s] - else - @subj_thread[s] = root.thread - end - end + ## new root. need to drop old one and put this one in its place + if root != oldroot && oldroot.thread + oldroot.thread.drop oldroot + oldroot.thread = nil + end + + key = + if thread_by_subj? + Message.normalize_subj root.subj else - ## to disable subject grouping, use the next line instead - ## (and the same for below) - #Redwood::log "[1] for #{root}, subject #{Message.normalize_subj(root.subj)} has #{@subj_thread[Message.normalize_subj(root.subj)] ? 'a' : 'no'} thread" - thread = (@subj_thread[Message.normalize_subj(root.subj)] ||= Thread.new) - #thread = (@subj_thread[root.id] ||= Thread.new) - - thread << root - root.thread = thread - # Redwood::log "[1] added #{root} to #{thread}" - end - else - if oldroot.thread - ## new root. need to drop old one and put this one in its place - oldroot.thread.drop oldroot - oldroot.thread = nil + root.id end - if root.thread - ## check to see if the subject is still the same (in the case - ## that we first added a child message with a different - ## subject) - s = Message.normalize_subj(root.subj) - unless @subj_thread[s] == root.thread - # Redwood::log "[2] moving thread to new subject #{root.subj}" - if @subj_thread[s] - @subj_thread[s] << root - root.thread = @subj_thread[s] - else - @subj_thread[s] = root.thread - end + ## check to see if the subject is still the same (in the case + ## that we first added a child message with a different + ## subject) + if root.thread + unless @threads[key] == root.thread + if @threads[key] + root.thread.empty! + @threads[key] << root + root.thread = @threads[key] + else + @threads[key] = root.thread end - - else - ## to disable subject grouping, use the next line instead - ## (and the same above) - - ## this code is duplicated above. sorry! TODO: refactor - # Redwood::log "[2] for #{root}, subject '#{Message.normalize_subj(root.subj)}' has #{@subj_thread[Message.normalize_subj(root.subj)] ? 'a' : 'no'} thread" - - thread = (@subj_thread[Message.normalize_subj(root.subj)] ||= Thread.new) - #thread = (@subj_thread[root.id] ||= Thread.new) - - thread << root - root.thread = thread - # Redwood::log "[2] added #{root} to #{thread}" end + else + thread = @threads[key] + thread << root + root.thread = thread end ## last bit -- 2.45.2