Attachment 'oo2txt.py'

Download

   1 #!/usr/bin/python
   2 #
   3 # simple OO writer to text. just strips all markup, useable for search engines
   4 # this is an example with only minimal error handling
   5 #
   6 # Copyright (c) 2004 by Oliver Graf <ograf@bitart.de>
   7 #
   8 
import re
import sys
import zipfile
import xml.parsers.expat
try:
	from htmlentitydefs import name2codepoint  # Python 2
except ImportError:
	from html.entities import name2codepoint  # Python 3
  12 
  13 class OOparser:
  14 	
  15 	def __init__(self, entities=None):
  16 		self.entities={}
  17 		self.entities.update(name2codepoint)
  18 		if entities is not None:
  19 			self.entities.update(entities)
  20 
  21 	def reset(self):
  22 		self.data=u''
  23 		self.parser=p=xml.parsers.expat.ParserCreate()
  24 		p.CharacterDataHandler         = self._char_data
  25 		p.DefaultHandler               = self._default_data
  26 		p.StartElementHandler          = self._ignore
  27 		p.EndElementHandler            = self._ignore
  28 		p.XmlDeclHandler               = self._ignore
  29 		p.StartDoctypeDeclHandler      = self._ignore
  30 		p.EndDoctypeDeclHandler        = self._ignore
  31 		p.ElementDeclHandler           = self._ignore
  32 		p.AttlistDeclHandler           = self._ignore
  33 		p.ProcessingInstructionHandler = self._ignore
  34 		p.UnparsedEntityDeclHandler    = self._ignore
  35 		p.EntityDeclHandler            = self._ignore
  36 		p.NotationDeclHandler          = self._ignore
  37 		p.StartNamespaceDeclHandler    = self._ignore
  38 		p.EndNamespaceDeclHandler      = self._ignore
  39 		p.CommentHandler               = self._ignore
  40 		p.StartCdataSectionHandler     = self._ignore
  41 		p.EndCdataSectionHandler       = self._ignore
  42 		p.ExternalEntityRefHandler     = self._ignore
  43 		return p
  44 
  45 	def parse(self, data):
  46 		p=self.reset()
  47 		p.Parse(data,1)
  48 		return self.data
  49 
  50 	def _char_data(self, data):
  51 		self.data+=data
  52 
  53 	def _default_data(self, data):
  54 		# handle entities! everything inside should be unicode
  55 		if data[0]=='&' and data[-1]==';':
  56 			if self.entities.has_key(data[1:-1]):
  57 				self.data+=unichr(self.entities[data[1:-1]])
  58 				return
  59 		self.data+=data
  60 
  61 	def _ignore(self, *args, **kwargs):
  62 		pass
  63 
  64 def oo2txt(filename):
  65 	z=zipfile.ZipFile(filename,'r')
  66 	data=z.read('content.xml')
  67 	p=OOparser()
  68 	words=re.split(r'\W+',p.parse(data).encode('ISO-8859-1'))
  69 	print '\n'.join(words)
  70 
  71 if __name__=='__main__':
  72 	if len(sys.argv)!=2:
  73 		sys.stderr.write('Usage: %s FILENAME\n'%(sys.argv[0]))
  74 		sys.exit(1)
  75 	oo2txt(sys.argv[1])

Attached Files

To refer to attachments on a page, use attachment:filename, as shown below in the list of files. Do NOT use the URL of the [get] link, since this is subject to change and can break easily.
  • [get | view] (2004-08-25 14:35:47, 15.1 KB) [[attachment:moin-1.3_attsearch.diff]]
  • [get | view] (2004-06-21 13:45:13, 2.2 KB) [[attachment:oo2txt.py]]
 All files | Selected Files: delete move to page copy to page

You are not allowed to attach a file to this page.