This is the code.
- #!/usr/bin/env python
- # -*- coding: utf-8 -*-
- ### globals
- import sys
- import re
- from bsddb import db
- from xml.sax import ContentHandler
- from xml.sax import make_parser
class BaseHandler(ContentHandler):
    """Top-level SAX handler that routes ``<title>`` and ``<text>`` elements.

    The handler owns the Berkeley DB handle (``self.db``) and the record being
    assembled for the current page (``self.key_value_map``).  When a ``title``
    or ``text`` element starts, it installs the matching delegate handler on
    the parser; the delegates are expected to hand control back afterwards.
    """

    def __init__(self, parser, db_path="test_db"):
        """Open the database and remember the parser.

        parser  -- the SAX parser whose content handler is switched.
        db_path -- file name of the hash database; opened with DB_CREATE.
                   Defaults to "test_db", the previously hard-coded name.
        """
        ContentHandler.__init__(self)
        self.db = db.DB()
        self.db.open(db_path, None, db.DB_HASH, db.DB_CREATE)
        self.parser = parser
        # Record for the page currently being parsed ("title" / "en" keys are
        # filled in by the delegate handlers).
        self.key_value_map = {}

    def setHandlers(self, titleHandler, textHandler):
        """Register the delegates used for ``<title>`` and ``<text>``.

        Must be called before parsing starts: startElement() reads both
        attributes and they are not set anywhere else.
        """
        self.titleHandler = titleHandler
        self.textHandler = textHandler

    def startElement(self, name, attrs):
        """Switch the parser to the delegate for ``name``; ignore other tags."""
        if name == u"title":
            self.parser.setContentHandler(self.titleHandler)
        elif name == u"text":
            self.parser.setContentHandler(self.textHandler)
class TitleHandler(ContentHandler):
    """Collect the text of one ``<title>`` element into the shared record.

    A SAX parser is allowed to deliver the character data of a single element
    in several characters() calls.  The pieces are therefore buffered and the
    complete title is stored only when the element closes; storing each piece
    directly would leave just the last piece in the record.
    """

    def __init__(self, baseHandler, parser):
        """Remember the base handler (owner of the record) and the parser."""
        ContentHandler.__init__(self)
        self.baseHandler = baseHandler
        self.parser = parser
        # Pieces of the title currently being read.
        self._chunks = []

    def characters(self, char):
        """Buffer one non-empty piece of title text."""
        if char:
            self._chunks.append(char)

    def endElement(self, name):
        """On ``</title>``: store the UTF-8 encoded title and return control.

        Nothing is stored when no text was seen, so an empty title leaves the
        record without a "title" key, as before.
        """
        if name == u"title":
            if self._chunks:
                title = u"".join(self._chunks)
                self.baseHandler.key_value_map["title"] = title.encode("utf-8")
                self._chunks = []
            # Hand control back to the base handler.
            self.parser.setContentHandler(self.baseHandler)
class TextHandler(ContentHandler):
    """Extract the English interlanguage link from one ``<text>`` element.

    The element's character data is buffered, because a SAX parser may deliver
    it in several characters() calls and a ``[[en:...]]`` link could straddle
    two of them.  When the element closes, the first link found in the whole
    text is stored under "en", the (title, en) pair is written to the
    database, and the shared record is reset for the next page.
    """

    # Compiled once for the class instead of on every characters() call.
    _EN_LINK = re.compile(r"\[\[en:(.*?)\]\]")

    def __init__(self, baseHandler, parser):
        """Remember the base handler (owner of db and record) and the parser."""
        ContentHandler.__init__(self)
        self.baseHandler = baseHandler
        self.parser = parser
        # Pieces of the page text currently being read.
        self._chunks = []

    def characters(self, char):
        """Buffer one non-empty piece of page text."""
        if char:
            self._chunks.append(char)

    def endElement(self, name):
        """On ``</text>``: store the page, print the record, return control.

        Raises KeyError if no "title" was recorded for the page.
        """
        if name == u"text":
            record = self.baseHandler.key_value_map
            match = self._EN_LINK.search(u"".join(self._chunks))
            self._chunks = []
            if match:
                record["en"] = match.group(1).encode("utf-8")
            # Pages without an English link are stored with an empty value.
            self.baseHandler.db.put(record["title"], record.get("en", ""))
            # Single-argument call form prints the same on Python 2 and 3.
            print(record)
            self.parser.setContentHandler(self.baseHandler)
            # Start the next page with an empty record.
            self.baseHandler.key_value_map = {}
if __name__ == "__main__":
    # Dump to read: first command-line argument if given, otherwise the
    # previously hard-coded file name.
    if len(sys.argv) > 1:
        dumpPath = sys.argv[1]
    else:
        dumpPath = 'jawiki-latest-pages-meta-current.xml'

    parser = make_parser()
    baseHandler = BaseHandler(parser)
    titleHandler = TitleHandler(baseHandler, parser)
    textHandler = TextHandler(baseHandler, parser)
    baseHandler.setHandlers(titleHandler, textHandler)
    parser.setContentHandler(baseHandler)
    try:
        # Binary mode hands the parser the raw bytes of the dump; the file is
        # closed even when parsing fails.
        with open(dumpPath, 'rb') as xmlFile:
            parser.parse(xmlFile)
    finally:
        # Close the database so that pending writes reach the file.
        baseHandler.db.close()
0 件のコメント:
コメントを投稿