-#!/bin/env ruby
+#!/usr/bin/env ruby
+require 'uri'
+require 'rubygems'
+require 'trollop'
require "sup"
+Thread.abort_on_exception = true # make debugging possible
+
class Float
def to_s; sprintf '%.2f', self; end
end
Time.now - startt
end
-def educate_user
- $stderr.puts <<EOS
-Loads messages into the Sup index, adding sources as needed to the
-source list.
+opts = Trollop::options do
+ version "sup-import (sup #{Redwood::VERSION})"
+ banner <<EOS
+Imports messages into the Sup index from one or more sources.
Usage:
sup-import [options] <source>*
-where <source>* is zero or more source descriptions (e.g., mbox
-filenames on disk).
-
-If the sources listed are not already in the Sup source list,
-they will be added to it, as parameterized by the following options:
- --archive: messages from these sources will not appear in the inbox
- --unusual: these sources will not be polled when the flag --the-usual
- is called
-
-Regardless of whether the sources are new or not, they will be polled,
-and any new messages will be added to the index, as parameterized by
-the following options:
- --force-archive: regardless of the source "archive" flag, any new
- messages found will not appear in the inbox.
- --force-read: any messages found will not be marked as new.
-
-The following options can also be specified:
- --the-usual: import new messages from all usual sources
- --rebuild: rebuild the index for the specified sources rather than
- just adding new messages. Useful if the sources
- have changed in any way *other* than new messages
- being added.
- --force-rebuild: force a rebuild of all messages in the inbox, not just
- ones that have changed. You probably won't need this
- unless William changes the index format.
- --optimize: optimize the index after adding any new messages.
- --help: don't do anything, just show this message.
-EOS
- exit
-end
-
-educate_user if ARGV.member? '--help'
-archive = ARGV.delete "--archive"
-unusual = ARGV.delete "--unusual"
-force_archive = ARGV.delete "--force-archive"
-force_read = ARGV.delete "--force-read"
-the_usual = ARGV.delete "--the-usual"
-rebuild = ARGV.delete "--rebuild"
-force_rebuild = ARGV.delete "--force-rebuild"
-optimize = ARGV.delete "--optimize"
+where <source>* is zero or more source URIs or mbox filenames. If no
+sources are given, imports messages from all sources marked as
+"usual".
-if(o = ARGV.find { |x| x =~ /^--/ })
- $stderr.puts "error: unknown option #{o}"
- educate_user
+Options are:
+EOS
+ opt :archive, "Automatically archive any imported messages."
+ opt :read, "Automatically mark as read any imported messages."
+ opt :verbose, "Print message ids as they're processed."
+ opt :optimize, "As the last stage of the import, optimize the index."
+ text <<EOS
+
+The following options allow sup-import to consider *all* messages in the
+source, not just new ones:
+EOS
+ opt :rebuild, "Scan over the entire source and update the index to account for any messages that have been deleted, altered, or moved from another source."
+ opt :full_rebuild, "Re-insert all messages in the source, not just ones that have changed or are new."
+ opt :start_at, "For rescan and rebuild, start at the given offset.", :type => :int
+ opt :overwrite_state, "For --full-rebuild, overwrite the message state to the default state for that source, obeying --archive and --read if given."
end
+Trollop::die :start_at, "must be non-negative" if (opts[:start_at] || 0) < 0
+Trollop::die :start_at, "requires either --rebuild or --full-rebuild" if opts[:start_at] && !(opts[:rebuild] || opts[:full_rebuild])
+Trollop::die :overwrite_state, "requires --full-rebuild" if opts[:overwrite_state] && !opts[:full_rebuild]
+Trollop::die :force_rebuild, "cannot be specified with --rebuild" if opts[:full_rebuild] && opts[:rebuild]
-puts "loading index..."
+Redwood::start
index = Redwood::Index.new
index.load
-pre_nm = index.size
-puts "loaded index of #{index.size} messages"
-
-sources = ARGV.map do |fn|
- source = index.source_for fn
- unless source
- source = Redwood::MBox::Loader.new(fn, 0, !unusual, !!archive)
- index.add_source source
+
+sources = ARGV.map do |uri|
+ uri = "mbox://#{uri}" unless uri =~ %r!://!
+ index.source_for uri or raise "Unknown source: #{uri}"
+end
+
+sources = index.usual_sources if sources.empty?
+
+if opts[:rebuild] || opts[:full_rebuild]
+ if opts[:start_at]
+ sources.each { |s| s.seek_to! opts[:start_at] }
+ else
+ sources.each { |s| s.reset! }
end
- source
end
-sources = (sources + index.usual_sources).uniq if the_usual
-sources.each { |s| s.reset! } if rebuild || force_rebuild
+last_update = start = Time.now
found = {}
-start = Time.now
begin
sources.each do |source|
- next if source.done?
- puts "loading from #{source}... "
- num = 0
- start_offset = nil
- source.each do |offset, labels|
- start_offset ||= offset
- labels -= [:inbox] if force_archive
- labels -= [:unread] if force_read
- begin
- m = Redwood::Message.new source, offset, labels
- if found[m.id]
- puts "skipping duplicate message #{m.id}"
- next
- else
- found[m.id] = true
- end
-
- m.remove_label :unread if m.mbox_status == "RO" unless force_read
- if (rebuild || force_rebuild) &&
- (docid, entry = index.load_entry_for_id(m.id)) && entry
- if force_rebuild || entry[:source_info].to_i != offset
- puts "replacing message #{m.id} labels #{entry[:label].inspect} (offset #{entry[:source_info]} => #{offset})"
- m.labels = entry[:label].split.map { |l| l.intern }
- num += 1 if index.update_message m, source, offset
- end
- else
- num += 1 if index.add_message m
- end
- rescue Redwood::MessageFormatError => e
- $stderr.puts "ignoring erroneous message at #{source}##{offset}: #{e.message}"
+ num_added = 0
+ num_updated = 0
+ puts "Scanning #{source}..."
+ Redwood::PollManager.add_new_messages_from source do |m, offset, entry|
+ ## if the entry exists on disk
+ if entry && !opts[:overwrite_state]
+ m.labels = entry[:label].split(/\s+/).map { |x| x.intern }
+ else
+ ## m.labels defaults to labels from the source
+ m.labels -= [:inbox] if opts[:archive]
+ m.labels -= [:unread] if opts[:read]
end
- if num % 1000 == 0 && num > 0
- elapsed = Time.now - start
- pctdone = (offset.to_f - start_offset) / (source.total.to_f - start_offset)
- remaining = (source.total.to_f - offset.to_f) * (elapsed.to_f / (offset.to_f - start_offset))
- puts "## #{num} (#{(pctdone * 100.0)}% done) read; #{elapsed.to_time_s} elapsed; est. #{remaining.to_time_s} remaining"
+
+ if Time.now - last_update > 60
+ last_update = Time.now
+ elapsed = last_update - start
+ pctdone = source.respond_to?(:pct_done) ? source.pct_done : 100.0 * (source.cur_offset.to_f - source.start_offset).to_f / (source.end_offset - source.start_offset).to_f
+ remaining = (100.0 - pctdone) * (elapsed.to_f / pctdone)
+ puts "## #{num} (#{pctdone}% done) read; #{elapsed.to_time_s} elapsed; est. #{remaining.to_time_s} remaining"
+ end
+
+ ## update if...
+ if entry.nil? # it's a new message; or
+ puts "Adding message at #{offset}, labels: #{m.labels * ' '}" if opts[:verbose]
+ num_added += 1
+ found[m.id] = true
+ m
+ elsif opts[:full_rebuild] || # we're updating everyone; or
+ (opts[:rebuild] && (entry[:source_id].to_i != source.id || entry[:source_info].to_i != offset)) # we're updating just the changed ones
+ puts "Updating message at #{offset} (from #{m.from.longname}, subject '#{m.subj}'), source #{entry[:source_id]} => #{source.id}, offset #{entry[:source_info]} => #{offset}, labels: {#{m.labels * ', '}}" if opts[:verbose]
+ num_updated += 1 unless found[m.id]
+ found[m.id] = true
+ m
+ else
+ found[m.id] = true
+ nil
end
end
- puts "loaded #{num} messages" unless num == 0
+ puts "Added #{num_added}, updated #{num_updated} messages from #{source}."
end
ensure
+ puts "Saving index and sources..."
index.save
+ Redwood::finish
end
-if rebuild || force_rebuild
- puts "deleting missing messages from the index..."
- numdel = 0
+## delete any messages in the index that claim they're from one of
+## these sources, but that we didn't see.
+##
+## kinda crappy code here, because we delve directly into the Ferret
+## API.
+##
+## TODO: move this to Index, i suppose.
+if opts[:rebuild] || opts[:full_rebuild]
+ puts "Deleting missing messages from the index..."
+ numdel = num = 0
sources.each do |source|
raise "no source id for #{source}" unless source.id
- index.index.search_each("source_id:#{source.id}", :limit => :all) do |docid, score|
+ q = "+source_id:#{source.id}"
+ q += " +source_info: >= #{opts[:start_at]}" if opts[:start_at]
+ num += index.index.search_each(q, :limit => :all) do |docid, score|
mid = index.index[docid][:message_id]
+# puts "got #{mid}"
next if found[mid]
- puts "deleting #{mid}"
+ puts "Deleting #{mid}" if opts[:verbose]
index.index.delete docid
numdel += 1
end
end
- puts "deleted #{numdel} messages"
+ puts "Deleted #{numdel} / #{num} messages"
end
-if optimize
- puts "optimizing index..."
+if opts[:optimize]
+ puts "Optimizing index..."
optt = time { index.index.optimize }
- puts "optimized index of size #{index.size} in #{optt}s."
+ puts "Optimized index of size #{index.size} in #{optt}s."
end