X-Git-Url: https://git.notmuchmail.org/git?p=notmuch;a=blobdiff_plain;f=xapian-dump.cc;h=39c1c3404b02c13e6d01d1c3dfbe944503d63778;hp=0ab5e32c7cb6f4f1810a07ec026e903f0bc38a26;hb=9bc4253fa804b62ff31e8de82a139b2cb12b118f;hpb=26795d64e6150b543d850c1e882a9d1395b58d9e diff --git a/xapian-dump.cc b/xapian-dump.cc index 0ab5e32c..39c1c340 100644 --- a/xapian-dump.cc +++ b/xapian-dump.cc @@ -1,4 +1,4 @@ -/* xapian-dump: Dump all document IDs from a Xapian database +/* xapian-dump: Create a textual dump of a Xapian database. * * Copyright © 2009 Carl Worth * @@ -18,38 +18,183 @@ * Author: Carl Worth */ +/* Currently the dumped data includes: + * + * All document IDs + * + * And for each document ID: + * + * Document data + * All document terms + * All document values + */ + #include #include +#include #include using namespace std; +vector UNSERIALIZE; + +unsigned int MAX_TERMS = 0; + +static void +print_escaped_string (const char *s) +{ + printf ("\""); + + while (*s) { + if (*s == '"') + printf ("\\"); + printf ("%c", *s); + s++; + } + + printf ("\""); +} + +static void +print_document_terms (Xapian::Document doc) +{ + Xapian::TermIterator it; + unsigned int i; + + printf (" {\n"); + + for (it = doc.termlist_begin (), i = 0; + it != doc.termlist_end (); + it++, i++) + { + printf (" "); + print_escaped_string ((*it).c_str()); + printf (",\n"); + } + + for ( ; i < MAX_TERMS; i++) + printf (" \"\",\n"); + + printf (" },\n"); +} + +static int +vector_int_contains (vector v, int i) +{ + vector::iterator result; + + result = find (v.begin(), v.end(), i); + + return result != v.end(); +} + +static void +print_document_values (Xapian::Document doc) +{ + Xapian::ValueIterator i; + int value_no, value_int; + double value_float; + + for (i = doc.values_begin (); i != doc.values_end (); i++) { + value_no = i.get_valueno(); + + printf (" "); + + if (vector_int_contains (UNSERIALIZE, value_no)) { + value_float = Xapian::sortable_unserialise (*i); + value_int = value_float; + if (value_int == value_float) + printf ("%d", value_int); + else + printf ("\"%f\"", value_float); + } else { + print_escaped_string ((*i).c_str ()); + } + + printf (",\n"); + } + +} + +static void +print_document (Xapian::Database db, Xapian::docid id) +{ + Xapian::Document doc; + + printf ("{\n"); + + doc = db.get_document (id); + + printf (" \"%s\",\n", doc.get_data ().c_str()); + + print_document_terms (doc); + + print_document_values (doc); + + printf ("},\n"); +} + int main (int argc, char *argv[]) { const char *database_path; + int i; if (argc < 2) { - fprintf (stderr, "Usage: %s \n", + fprintf (stderr, "Usage: %s [value_nos...]\n", argv[0]); + fprintf (stderr, "Dumps data from the given database.\n"); + fprintf (stderr, "The values corresponding to any value numbers given on the command line\n"); + fprintf (stderr, "will be unserialized to an before being printed.\n"); exit (1); } database_path = argv[1]; - try { + UNSERIALIZE = vector (); + for (i = 2; i < argc; i++) + UNSERIALIZE.push_back (atoi (argv[i])); + + try { Xapian::Database db; Xapian::PostingIterator i; Xapian::docid doc_id; db = Xapian::Database (database_path); + for (i = db.postlist_begin (""); i != db.postlist_end (""); i++) { + Xapian::Document doc; + doc_id = *i; - printf ("Found document %u\n", doc_id); + + doc = db.get_document (doc_id); + + if (doc.termlist_count () > MAX_TERMS) + MAX_TERMS = doc.termlist_count (); } + printf ("#define MAX_TERMS %d\n\n", MAX_TERMS); + + printf ("typedef struct {\n" + " char data[255];\n" + " char terms[MAX_TERMS][255];\n" + " char message_id[255];\n" + " char thread_id[4096];\n" + " time_t time;\n" + "} document_dump_t;\n\n"); + + printf ("document_dump_t dump[] = {\n"); + + for (i = db.postlist_begin (""); i != db.postlist_end (""); i++) { + doc_id = *i; + + print_document (db, doc_id); + } + + printf ("};\n"); + } catch (const Xapian::Error &error) { cerr << "A Xapian exception occurred: " << error.get_msg () << endl; exit (1);