From: José Fonseca Date: Fri, 25 Jul 2008 09:32:01 +0000 (+0900) Subject: Add a script to convert logs from XML to plain text. X-Git-Url: https://git.notmuchmail.org/git?a=commitdiff_plain;h=a71090e5dfa7b901deee5bf3ed34b6c1a33f64ef;p=apitrace Add a script to convert logs from XML to plain text. XSLT processors read the whole log to memory, so they are not adequate for large logs. --- diff --git a/xml2txt.py b/xml2txt.py new file mode 100755 index 0000000..2890129 --- /dev/null +++ b/xml2txt.py @@ -0,0 +1,248 @@ +#!/usr/bin/env python +############################################################################# +# +# Copyright 2008 Tungsten Graphics, Inc. +# +# This program is free software: you can redistribute it and/or modify it +# under the terms of the GNU Lesser General Public License as published +# by the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
#############################################################################

"""Convert XML call traces to plain text, one call per output line.

The input is tokenized incrementally with expat and each ``<call>`` element
is printed as soon as it has been parsed, so the whole log never has to be
held in memory.
"""


import sys
import xml.parsers.expat


# Token kinds produced by XmlTokenizer.
ELEMENT_START, ELEMENT_END, CHARACTER_DATA, EOF = range(4)


class XmlToken:
    """A single token of an XML document.

    Attributes:
        type: one of ELEMENT_START, ELEMENT_END, CHARACTER_DATA or EOF.
        name_or_data: element name for element tokens, the text for
            CHARACTER_DATA tokens, None for EOF.
        attrs: attribute mapping for ELEMENT_START tokens, otherwise None.
        line, column: parser position recorded for the token, or None when
            the token was built by hand (e.g. as the "expected" side of a
            TokenMismatch).
    """

    def __init__(self, type, name_or_data, attrs = None, line = None, column = None):
        assert type in (ELEMENT_START, ELEMENT_END, CHARACTER_DATA, EOF)
        self.type = type
        self.name_or_data = name_or_data
        self.attrs = attrs
        self.line = line
        self.column = column

    def __str__(self):
        """Return a short human readable form, used in error messages."""
        if self.type == ELEMENT_START:
            return '<' + self.name_or_data + ' ...>'
        if self.type == ELEMENT_END:
            return '</' + self.name_or_data + '>'
        if self.type == CHARACTER_DATA:
            return self.name_or_data
        if self.type == EOF:
            return 'end of file'
        # Only reachable if the type was tampered with after construction;
        # raise explicitly so the check survives "python -O".
        raise AssertionError('invalid token type %r' % (self.type,))


class XmlTokenizer:
    """Expat based XML tokenizer.

    Reads ``fp`` in fixed-size chunks and turns expat callbacks into a
    stream of XmlToken objects retrieved with next().  Only the tokens
    produced by the most recent chunk are kept.

    Args:
        fp: object with a ``read(size)`` method returning bytes or str.
        strip: when true, leading/trailing whitespace is removed from text
            and text that is empty after stripping produces no token.
    """

    def __init__(self, fp, strip = True):
        self.fp = fp
        self.tokens = []
        self.index = 0
        self.final = False
        self.strip = strip

        # Text collected since the last element boundary.  Expat may report
        # one run of text through several callbacks (at entities and at
        # chunk boundaries), so the pieces are joined before a token is made.
        self.character_pos = None
        self.character_data = []

        self.parser = xml.parsers.expat.ParserCreate()
        self.parser.StartElementHandler = self.handle_element_start
        self.parser.EndElementHandler = self.handle_element_end
        self.parser.CharacterDataHandler = self.handle_character_data

    def handle_element_start(self, name, attributes):
        """Expat callback: queue an ELEMENT_START token."""
        self.finish_character_data()
        line, column = self.pos()
        token = XmlToken(ELEMENT_START, name, attributes, line, column)
        self.tokens.append(token)

    def handle_element_end(self, name):
        """Expat callback: queue an ELEMENT_END token."""
        self.finish_character_data()
        line, column = self.pos()
        token = XmlToken(ELEMENT_END, name, None, line, column)
        self.tokens.append(token)

    def handle_character_data(self, data):
        """Expat callback: accumulate a fragment of text."""
        if not self.character_data:
            # Remember where the run of text began.
            self.character_pos = self.pos()
        self.character_data.append(data)

    def finish_character_data(self):
        """Queue the accumulated text, if any, as one CHARACTER_DATA token."""
        if not self.character_data:
            return
        data = ''.join(self.character_data)
        self.character_data = []
        if self.strip:
            data = data.strip()
            if not data:
                return
        line, column = self.character_pos
        token = XmlToken(CHARACTER_DATA, data, None, line, column)
        self.tokens.append(token)

    def next(self):
        """Return the next token; an EOF token once the input is exhausted."""
        size = 16*1024
        while self.index >= len(self.tokens) and not self.final:
            self.tokens = []
            self.index = 0
            data = self.fp.read(size)
            # Only an empty read means end of input; a short read does not.
            self.final = not data
            self.parser.Parse(data, self.final)
            if self.final:
                self.finish_character_data()
        if self.index >= len(self.tokens):
            line, column = self.pos()
            token = XmlToken(EOF, None, None, line, column)
        else:
            token = self.tokens[self.index]
            self.index += 1
        return token

    def pos(self):
        """Return the parser's current (line, column)."""
        return self.parser.CurrentLineNumber, self.parser.CurrentColumnNumber


class TokenMismatch(Exception):
    """Raised when the token found is not the one the grammar expects.

    Attributes:
        expected: an XmlToken or a descriptive string.
        found: the XmlToken actually read; its line and column are used in
            the message.
    """

    def __init__(self, expected, found):
        Exception.__init__(self, expected, found)
        self.expected = expected
        self.found = found

    def __str__(self):
        return '%u:%u: %s expected, %s found' % (self.found.line, self.found.column, str(self.expected), str(self.found))


class XmlParser:
    """Base XML document parser.

    Keeps one token of lookahead in ``self.token``; constructing the parser
    already reads the first token from ``fp``.
    """

    def __init__(self, fp):
        self.tokenizer = XmlTokenizer(fp)
        self.consume()

    def consume(self):
        """Advance the lookahead to the next token."""
        self.token = self.tokenizer.next()

    def match_element_start(self, name):
        """Return whether the lookahead is the start of element ``name``."""
        return self.token.type == ELEMENT_START and self.token.name_or_data == name

    def match_element_end(self, name):
        """Return whether the lookahead is the end of element ``name``."""
        return self.token.type == ELEMENT_END and self.token.name_or_data == name

    def element_start(self, name):
        """Consume the start of element ``name`` and return its attributes.

        Any text before the element is skipped.

        Raises:
            TokenMismatch: if the next non-text token is anything else.
        """
        while self.token.type == CHARACTER_DATA:
            self.consume()
        if not self.match_element_start(name):
            raise TokenMismatch(XmlToken(ELEMENT_START, name), self.token)
        attrs = self.token.attrs
        self.consume()
        return attrs

    def element_end(self, name):
        """Consume the end of element ``name``, skipping any text before it.

        Raises:
            TokenMismatch: if the next non-text token is anything else.
        """
        while self.token.type == CHARACTER_DATA:
            self.consume()
        if not self.match_element_end(name):
            raise TokenMismatch(XmlToken(ELEMENT_END, name), self.token)
        self.consume()

    def character_data(self):
        """Consume and return the text at the lookahead ('' if none)."""
        data = ''
        # Join adjacent text tokens so no fragment is silently dropped.
        while self.token.type == CHARACTER_DATA:
            data += self.token.name_or_data
            self.consume()
        return data


class TraceParser(XmlParser):
    """Parser for ``<trace>`` documents; writes one text line per call."""

    def parse(self):
        """Parse the whole trace, writing each call to sys.stdout."""
        self.element_start('trace')
        while self.token.type != ELEMENT_END:
            self.parse_call()
        self.element_end('trace')

    def parse_call(self):
        """Parse one ``<call>`` and write ``name(arg = value, ...) = ret``.

        The `` = ret`` part is written only when a ``<ret>`` child exists.
        """
        attrs = self.element_start('call')
        name = attrs['name']
        args = []
        ret = None
        while self.token.type == ELEMENT_START:
            if self.token.name_or_data == 'arg':
                arg = self.parse_arg()
                args.append(arg)
            elif self.token.name_or_data == 'ret':
                ret = self.parse_ret()
            else:
                raise TokenMismatch("<arg ...> or <ret ...>", self.token)
        self.element_end('call')

        text = name + '(' + ', '.join([arg_name + ' = ' + arg_value for arg_name, arg_value in args]) + ')'
        if ret is not None:
            text += ' = ' + ret
        sys.stdout.write(text + '\n')

    def parse_arg(self):
        """Parse one ``<arg>`` and return ``(name, value)``."""
        attrs = self.element_start('arg')
        name = attrs['name']
        value = self.parse_value()
        self.element_end('arg')

        return name, value

    def parse_ret(self):
        """Parse one ``<ret>`` and return its value as text."""
        self.element_start('ret')
        value = self.parse_value()
        self.element_end('ret')

        return value

    def parse_value(self):
        """Parse a value: plain text, a run of ``<elem>``s, or a ``<ref>``.

        Raises:
            TokenMismatch: if the lookahead is none of those.
        """
        if self.token.type == CHARACTER_DATA:
            return self.character_data()
        if self.token.type == ELEMENT_START:
            if self.token.name_or_data == 'elem':
                return self.parse_elems()
            if self.token.name_or_data == 'ref':
                return self.parse_ref()
        raise TokenMismatch("<elem ...>, <ref ...>, or text", self.token)

    def parse_elems(self):
        """Parse consecutive ``<elem>``s into ``{name = value, ...}``."""
        elems = [self.parse_elem()]
        while self.token.type != ELEMENT_END:
            elems.append(self.parse_elem())
        return '{' + ', '.join([elem_name + ' = ' + elem_value for elem_name, elem_value in elems]) + '}'

    def parse_elem(self):
        """Parse one ``<elem>`` and return ``(name, value)``."""
        attrs = self.element_start('elem')
        name = attrs['name']
        value = self.parse_value()
        self.element_end('elem')

        return name, value

    def parse_ref(self):
        """Parse one ``<ref>``.

        Returns ``'&'`` followed by the nested value when the element has
        content, otherwise the value of its ``addr`` attribute.
        """
        attrs = self.element_start('ref')
        if self.token.type != ELEMENT_END:
            value = '&' + self.parse_value()
        else:
            value = attrs['addr']
        self.element_end('ref')

        return value


def main():
    """Convert every file named on the command line, writing to stdout."""
    for arg in sys.argv[1:]:
        # Binary mode lets expat honour the encoding declared in the XML
        # itself; the with-block closes each file once it is converted.
        with open(arg, 'rb') as fp:
            parser = TraceParser(fp)
            parser.parse()


if __name__ == '__main__':
    main()