2008年8月8日金曜日

wikipedia xml to Berkeley DB

I want to extract each page title and its link to the corresponding English Wikipedia page from a Wikipedia dump file.
This is the code.


#!/usr/bin/env python
# -*- coding: utf-8 -*-

### globals
import sys
import re
from bsddb import db
from xml.sax import ContentHandler
from xml.sax import make_parser

class BaseHandler(ContentHandler):
    """Top-level SAX handler that routes <title> and <text> elements.

    The handler owns the Berkeley DB handle (``self.db``) and a scratch
    dictionary (``self.key_value_map``) that the delegate handlers fill in.
    When a ``title`` or ``text`` element starts, the parser's content
    handler is switched to the matching delegate registered through
    ``setHandlers``.
    """

    def __init__(self, parser, db_path="test_db"):
        """Open the database and remember the parser.

        parser  -- the SAX parser whose content handler gets switched.
        db_path -- file name handed to ``DB.open``; defaults to the
                   original hard-coded "test_db".
        """
        ContentHandler.__init__(self)
        self.db = db.DB()
        self.db.open(db_path, None, db.DB_HASH, db.DB_CREATE)

        self.parser = parser
        self.key_value_map = {}

    def setHandlers(self, titleHandler, textHandler):
        """Register the delegates used for <title> and <text> elements."""
        self.titleHandler = titleHandler
        self.textHandler = textHandler

    def startElement(self, name, attrs):
        """Hand the parser over to a delegate for title/text elements.

        Any other element name leaves the current content handler alone.
        """
        if name == u"title":
            self.parser.setContentHandler(self.titleHandler)
        elif name == u"text":
            self.parser.setContentHandler(self.textHandler)

    def close(self):
        """Close the database handle opened in ``__init__``."""
        self.db.close()


class TitleHandler(ContentHandler):
    """Collect the text of a <title> element into the shared map.

    SAX parsers may deliver the character data of a single element in
    several ``characters`` calls, so the chunks are buffered and joined
    when the element ends instead of being stored (and overwritten) one
    chunk at a time.
    """

    def __init__(self, baseHandler, parser):
        """Remember the owning base handler and the parser to hand back."""
        ContentHandler.__init__(self)
        self.baseHandler = baseHandler
        self.parser = parser
        # Chunks of the title currently being read.
        self._parts = []

    def characters(self, char):
        """Buffer one non-empty chunk of title text."""
        if char:
            self._parts.append(char)

    def endElement(self, name):
        """On </title>, publish the UTF-8 encoded title and give control back.

        The entry ``key_value_map["title"]`` is only written when some text
        was received, so an empty title leaves the map untouched.
        """
        if name == u"title":
            title = u"".join(self._parts)
            # Reset so the next title starts from an empty buffer.
            self._parts = []
            if title:
                self.baseHandler.key_value_map["title"] = title.encode("utf-8")
            # Return control to the base handler.
            self.parser.setContentHandler(self.baseHandler)


class TextHandler(ContentHandler):
    """Find the ``[[en:...]]`` link in a <text> element and store the record.

    Character data is buffered across ``characters`` calls and searched
    once when the element ends, so a link that the parser happens to
    split between two chunks is still found. The first link in the
    whole text is the one recorded.
    """

    # Same pattern as before, compiled once instead of on every chunk.
    _EN_LINK = re.compile(r"\[\[en:(.*?)\]\]")

    def __init__(self, baseHandler, parser):
        """Remember the owning base handler and the parser to hand back."""
        ContentHandler.__init__(self)
        self.baseHandler = baseHandler
        self.parser = parser
        # Chunks of the article text currently being read.
        self._parts = []

    def characters(self, char):
        """Buffer one non-empty chunk of article text."""
        if char:
            self._parts.append(char)

    def endElement(self, name):
        """On </text>, write title -> English link and reset the shared map.

        The value stored is the UTF-8 encoded link target, or "" when the
        text has no ``[[en:...]]`` link. If no title was captured for
        this text, nothing is written to the database.
        """
        if name == u"text":
            record = self.baseHandler.key_value_map
            body = u"".join(self._parts)
            # Reset so the next text element starts from an empty buffer.
            self._parts = []

            found = self._EN_LINK.search(body)
            if found:
                record["en"] = found.group(1).encode("utf-8")

            title = record.get("title")
            if title is not None:
                self.baseHandler.db.put(title, record.get("en", ""))
            print(record)
            self.parser.setContentHandler(self.baseHandler)
            # Start the next page with a fresh map.
            self.baseHandler.key_value_map = {}

if __name__ == "__main__":
    # The dump to read may be given as the first command-line argument;
    # without one, the original hard-coded file name is used.
    if len(sys.argv) > 1:
        dumpPath = sys.argv[1]
    else:
        dumpPath = 'jawiki-latest-pages-meta-current.xml'

    parser = make_parser()
    baseHandler = BaseHandler(parser)
    try:
        titleHandler = TitleHandler(baseHandler, parser)
        textHandler = TextHandler(baseHandler, parser)
        baseHandler.setHandlers(titleHandler, textHandler)
        parser.setContentHandler(baseHandler)

        # Binary mode: the XML parser does its own decoding.
        xmlFile = open(dumpPath, 'rb')
        try:
            parser.parse(xmlFile)
        finally:
            xmlFile.close()
    finally:
        # Close the database even if parsing fails part-way through.
        baseHandler.db.close()



0 件のコメント: