Attachment 'tikihtml2moinmoin.py'

Download

   1 #!/usr/local/bin/python
   2 
   3 """
   4  converts tiki html to MoinMoin markup
   5  author: Daniela Nicklas <dani@miracle-solutions.de>
   6 """
   7 import sys
   8 from HTML2MoinMoin import HTML2MoinMoin
   9 import string
  10 import os
  11 import codecs
  12 import StringIO
  13 import re
  14 sys.path.append('Path to MoinMoin-Libs')
  15 from MoinMoin.wikiutil import isStrictWikiname
  16 
  17 # global variables
  18 sourcedir = "pages"
  19 targetdir = "text"
  20 # Pages that start with key go to Category
  21 page2category = {
  22     'NameInTiki': 'CategoryNewName',
  23     'AnothernameInTiki': 'CategoryAnotherOrSameNewName'}
  24 
  25 # Output Ignore
  26 class devnull:
  27     def write(self, data):
  28         return
  29     
  30 # HTML Parser
  31 class TikiHTML2MoinMoin(HTML2MoinMoin):
  32     start_tags = {
  33         "a"     : "[",
  34         "b"     : "'''",
  35         "em"    : "''",
  36         "tt"    : "{{{",
  37         "pre"   : "\n{{{",
  38         "p"     : "\n\n",
  39         "br"    : "\n\n",
  40         "h1"    : "\n\n= ",
  41         "h2"    : "\n\n== ",
  42         "h3"    : "\n\n=== ",
  43         "h4"    : "\n\n==== ",
  44         "h5"    : "\n\n===== ",
  45         "hr"    : "\n----\n",
  46         "title" : "",
  47         "table" : "\n",
  48         "tr"    : "",
  49         "td"    : "||"
  50         }
  51 
  52     end_tags = {
  53         "a"     : ']',
  54         "b"     : "'''",
  55         "em"    : "''",
  56         "tt"    : "}}}",
  57         "pre"   : "}}}\n",
  58         "p"     : "",
  59         "h1"    : " =\n\n",
  60         "h2"    : " ==\n\n",
  61         "h3"    : " ===\n\n",
  62         "h4"    : " ====\n\n",
  63         "h5"    : " =====\n\n",
  64         "table" : "\n",
  65         "title" : "",
  66         "tr"    : "||\n",
  67         "dt"    : ":: "
  68         }
  69     def __init__(self):
  70         HTML2MoinMoin.__init__(self)
  71         self.title = 0
  72         self.heading = 0
  73         self.head = 0
  74         self.linebreaks = 1
  75         self.tablecount = 0
  76         self.devnull = devnull()
  77         self.outputbackup = self.output
  78         self.div_mode = []
  79         self.a_mode = []
  80         self.tikipageurl = 'tiki-index.php?page='
  81         self.tikiediturl = 'tiki-editpage.php?page='
  82         self.tikicategoryurl = 'tiki-browse_categories.php?parentId='
  83         self.tikicategory = {
  84             1 : 'NameInTiki',
  85             4 : 'AnotherNameIn'
  86             }
  87         self.linkreplacements = {
  88             'HomePage': 'StartSeite',
  89             'UserPageYourPage': 'YourName'
  90             }
  91 
  92             
  93 
  94     def set_ignore(self):
  95         if self.output != self.devnull:
  96             self.outputbackup = self.output
  97             self.output = self.devnull
  98             
  99     def unset_ignore(self):
 100         self.output = self.outputbackup
 101 
 102 
 103 
 104     def do_html_start(self,attrs,tag):
 105         self.set_ignore()
 106         
 107     def do_table_end(self,tag):
 108         self.tablecount = self.tablecount + 1
 109         if self.tablecount == 1:
 110             self.unset_ignore()
 111             
 112     def do_h1_start(self,attrs,tag):
 113         self.heading = 1
 114         self.write(self.start_tags[tag])
 115 
 116     def do_h1_end(self,tag):
 117         self.heading = 0
 118         self.write(self.end_tags[tag])
 119 
 120     def do_a_start(self,attrs,tag):
 121         if self.heading:
 122             self.a_mode.append('heading')
 123         else:
 124             href = ''
 125             at_class = ''
 126             for att in attrs:
 127                 if (att[0] == 'href'):
 128                     href= att[1]
 129                 if (att[0] == 'class'):
 130                     at_class = att[1]
 131             if at_class == 'wikicache':
 132                 self.set_ignore()
 133                 self.a_mode.append('cache')
 134             elif href.find(self.tikipageurl) != -1:
 135                 href = href[:href.find('&')]
 136                 href = href.replace(self.tikipageurl,'')
 137                 for key, value in self.linkreplacements.items():
 138                     if href == key:
 139                         href = value
 140                         exit
 141                 self.write(self.start_tags[tag]+'wiki:'+href+' ')
 142                 self.a_mode.append(self.end_tags[tag])
 143             elif href.find(self.tikiediturl) != -1:
 144                 self.write('- FixMe/EditLink -')
 145                 self.set_ignore()
 146                 self.a_mode.append('fixme')
 147             elif href.find(self.tikicategoryurl) != -1:
 148                 href = href.replace(self.tikicategoryurl, '')
 149                 number = int(href[:href.find('&')])
 150                 self.write(self.start_tags[tag]+'wiki:'+self.tikicategory.get(number,'CategoryMissmatch')+' ')
 151                 self.a_mode.append(self.end_tags[tag])
 152                 if not self.tikicategory.has_key(number):
 153                     print "CategoryMissmatch: %s" % number
 154             elif href != '':
 155                 self.write(self.start_tags[tag])
 156                 self.write(href + " ")
 157                 self.a_mode.append(self.end_tags[tag])
 158             
 159             
 160     def do_a_end(self,tag):
 161         mode = self.a_mode.pop()
 162         if mode == 'fixme' or mode == 'cache':
 163             self.unset_ignore()
 164         elif not self.heading:
 165             self.write(mode)
 166 
 167     def do_div_start(self,attrs,tag):
 168         for att in attrs:
 169             if att == ('class', 'titlebar'):
 170                 self.heading = 1
 171                 self.write(self.start_tags["h3"])
 172                 self.div_mode.append(self.end_tags["h3"])
 173             else:
 174                 self.div_mode.append("")
 175 
 176     def do_div_end(self,tag):
 177         if self.heading:
 178             self.heading = 0
 179         self.write(self.div_mode.pop())
 180 
 181     def do_p_start(self,attrs,tag):
 182         for att in attrs:
 183             if att == ('class', 'editdate'):
 184                 self.set_ignore()
 185 
 186     def handle_data(self, data):
 187         data = data.replace("\r", "")
 188         if self.preformatted:
 189             self.write(data)
 190         else:
 191             self.write(data.replace("\n", " "))
 192 
 193             
 194                 
 195 # Main flow
 196 
 197 def main():
 198 #    sys.setdefaultencoding('iso-8859-1')
 199     # look for source directory
 200     if not os.access(sourcedir, os.F_OK):
 201         print "%s is not accessable"%sourcedir
 202         return ''
 203     else:
 204         sourcelist = os.listdir(sourcedir)
 205         
 206     # create target directory (if necessary)
 207     if not os.access(targetdir, os.F_OK):
 208         os.mkdir(targetdir)
 209 
 210     # first pass: transform htmp
 211     for sourcefile in sourcelist:
 212         print sourcefile,
 213         # open sourcefile
 214         sf = codecs.open(sourcedir+'/'+sourcefile,'r','UTF-8')
 215         htmldata = sf.read()
 216         htmldata = htmldata.encode('iso-8859-1','replace')
 217         sf.close()
 218         print ' .',
 219         # parse it and write output to target file (tf)
 220         p = TikiHTML2MoinMoin()
 221         tf = StringIO.StringIO()
 222         p.output = tf
 223         p.feed(htmldata)
 224         p.close()
 225         
 226         print '.',
 227         
 228         # second pass: delete white spaces
 229         wikidata = tf.getvalue()
 230         wikidata=wikidata.replace('\n\n\n','\n')
 231         
 232         # open targetfile
 233         tf = open(targetdir+'/'+sourcefile[:-5], 'w')
 234         
 235         # third pass: purify wiki-links and write to file
 236         wikinamepattern = r'\[wiki\:((?P<wikilink>.*?) (?P<label>.*?))\]'
 237         matches = re.finditer(wikinamepattern, wikidata)
 238         lastend = 0
 239         for match in matches:
 240             tf.write(wikidata[lastend:match.start()])
 241             mdict = match.groupdict()
 242             wikilink = mdict['wikilink']
 243             label = mdict['label']
 244             if wikilink == label:
 245                 if isStrictWikiname(wikilink):
 246                     tf.write(wikilink)
 247                 else:
 248                     tf.write('["%s"]'%wikilink)
 249             else:
 250                 tf.write(match.group())
 251             lastend = match.end()
 252         tf.write(wikidata[lastend:])
 253 
 254         # put pages in categorys
 255         for key, value in page2category.items():
 256             if sourcefile.find(key) == 0:
 257                 if wikidata.find(value) == -1:
 258                     tf.write('\n'+value)
 259                     
 260         # close targetfile
 261         tf.close
 262         print '. ->' + sourcefile[:-5]
 263 
 264 main()

Attached Files

To refer to attachments on a page, use attachment:filename, as shown below in the list of files. Do NOT use the URL of the [get] link, since this is subject to change and can break easily.
  • [get | view] (2003-12-07 18:15:54, 3.7 KB) [[attachment:HTML2MoinMoin.py]]
  • [get | view] (2003-12-07 18:15:54, 1.2 KB) [[attachment:gettiki_html.py]]
  • [get | view] (2003-12-07 18:15:54, 7.7 KB) [[attachment:tikihtml2moinmoin.py]]
 All files | Selected Files: delete move to page copy to page

You are not allowed to attach a file to this page.