ScriptMarket/HTMLImportScript - MoinMoin

Quick and dirty script to convert reasonably clean and well-formatted HTML (body only) to MoinMoin markup. The output will almost certainly need touch-up editing afterwards; see the main comment at the top of the script for details.

   1 #!/usr/bin/python
   2 
   3 # script accepting as input reasonably-formatted HTML/XHTML BODY, and
   4 # producing moinmoin markup as output. the markup will need some manual touch
   5 # up editing afterwards. things to look for:
   6 #   * H1 and H2 both map to top-level headings, with the expectation that
   7 #     H1 should be used only once for the page title (and hence should be edited
   8 #     out of the moin content). this will probably need manual adjustment
   9 #   * markup like '' or ''' might have gratuitous spaces around it
  10 #   * nested <B><STRONG> etc. won't work as expected (''' ''' foo ''' ''')
  11 #   * tables with empty cells, might produce |||| (needs a space)
  12 #   * inline comments don't work too well, best to move them out of the para
  13 #   * anchor tags are delayed until after a heading, might be better moved just
  14 #     before it
  15 
  16 # andrewb@cse.unsw.edu.au disclaims all responsibility for this ugly hack
  17 
  18 import sys, os.path
  19 from HTMLParser import HTMLParser, HTMLParseError
  20 
  21 TAG_MAP = {
  22     'h1'    : ('\n\n= ', ' =\n'),
  23     'h2'    : ('\n\n= ', ' =\n'),
  24     'h3'    : ('\n\n== ', ' ==\n'),
  25     'h4'    : ('\n\n=== ', ' ===\n'),
  26     'h5'    : ('\n\n==== ', ' ====\n'),
  27     'h6'    : ('\n\n===== ', ' =====\n'),
  28     'em'    : (" ''", "'' "),
  29     'i'     : (" ''", "'' "),
  30     'tt'    : (" `", "` "),
  31     'strong': (" '''", "''' "),
  32     'bold'  : (" '''", "''' "),
  33     'b'     : (" '''", "''' "),
  34     'p'     : ('\n', '\n'),
  35     'code'  : ('{{{', '}}}'),
  36 }
  37 
  38 ENTITY_MAP = {
  39     'nbsp'  : ' ',
  40     'lt'    : '<',
  41     'gt'    : '>',
  42     'amp'   : '&',
  43     'ndash' : '-',
  44     'mdash' : '--',
  45     'quot'  : '"',
  46 }
  47 
  48 # list types (FIXME: does python have an enum?)
  49 ORDERED = True
  50 UNORDERED = False
  51 
  52 # max line length to output
  53 MAXLINELEN = 76
  54 
  55 class MoinConverter(HTMLParser):
  56     def __init__(self, out):
  57         HTMLParser.__init__(self)
  58         self.out = out
  59         self.listtype = []
  60         self.indent = 0
  61         self.linepos = 0
  62         self.preformatted = False
  63         self.last_word_space = False
  64         self.in_heading = False
  65         self.no_newline = 0
  66         self.in_a = False
  67         self.queued_tags = []
  68 
  69     def newline(self):
  70         if (self.no_newline == 0):
  71             space = " " * self.indent
  72             self.out.write('\n' + space)
  73             self.linepos = len(space)
  74             self.last_word_space = False
  75 
  76     def output(self, word, space = False):
  77         if self.in_heading and not space:
  78             self.queued_tags.append(word)
  79             return
  80         while word.find('\n') != -1:
  81             (w1, w2) = word.split('\n', 1)
  82             if self.linepos + len(w1) > MAXLINELEN:
  83                 self.out.write('\n')
  84             self.out.write(w1)
  85             self.newline()
  86             word = w2
  87         if word == '':
  88             return
  89         if (self.last_word_space and space
  90             and self.linepos + len(word) < MAXLINELEN):
  91             self.out.write(" " + word)
  92             self.linepos += len(word) + 1
  93         else:
  94             if self.last_word_space and space:
  95                 if self.no_newline == 0:
  96                     self.newline()
  97                 else:
  98                     self.out.write(" ")
  99             elif self.linepos + len(word) > MAXLINELEN:
 100                 self.newline()
 101             self.out.write(word)
 102             self.linepos += len(word)
 103         self.last_word_space = space
 104 
 105     def handle_starttag(self, tag, attrlist):
 106         attrs = {}
 107         for (key, value) in attrlist:
 108             attrs[key] = value
 109 
 110         if TAG_MAP.has_key(tag):
 111             (start, end) = TAG_MAP[tag]
 112             self.output(start)
 113             if tag[0] == 'h':
 114                 self.in_heading = True
 115         elif tag == "ol":
 116             self.listtype.append(ORDERED)
 117             self.indent += 1
 118         elif tag == "ul":
 119             self.listtype.append(UNORDERED)
 120             self.indent += 1
 121         elif tag == "li":
 122             assert(self.listtype != [])
 123             space = " " * len(self.listtype)
 124             if self.listtype[-1] == ORDERED:
 125                 self.output("\n1.", True)
 126             else:
 127                 self.output("\n*", True)
 128         elif tag == "a":
 129             if attrs.has_key('href'):
 130                 url = attrs['href']
 131                 if (url.startswith('http://') or url.startswith('https://')
 132                     or url.startswith('ftp://') or url.startswith('mailto:')
 133                     or url.startswith('#')):
 134                     self.output('[%s' % url, True)
 135                 else:
 136                     # guess it's a relative URL, and make an attachment for it
 137                     attachname = os.path.basename(url)
 138                     self.output('[attachment:%s' % attachname, True)
 139                 self.in_a = True
 140                 self.no_newline += 1
 141             elif attrs.has_key('name'):
 142                 self.output('[[Anchor(%s)]]' % attrs['name'])
 143         elif tag == "tr":
 144             self.newline()
 145             self.no_newline += 1
 146         elif tag == "th":
 147             self.output("||")
 148             if attrs.has_key("style"):
 149                 self.output('<style="%s">' % attrs["style"])
 150             self.output("'''")
 151         elif tag == "hr":
 152             self.output("\n\n----\n")
 153         elif tag == "br":
 154             self.output("[[BR]]\n")
 155         elif tag == "td":
 156             self.output("||")
 157             if attrs.has_key("style"):
 158                 self.output('<style="%s">' % attrs["style"])
 159         elif tag == "pre":
 160             self.output('{{{\n')
 161             self.preformatted = True
 162         elif tag in ["table"]:
 163             pass
 164         else:
 165             sys.stderr.write("Warning: ignoring <%s %s>\n" % (tag, attrlist))
 166 
 167     def handle_startendtag(self, tag, attrlist):
 168         attrs = {}
 169         for (key, value) in attrlist:
 170             attrs[key] = value
 171 
 172         if tag == "hr":
 173             self.output("\n\n----\n")
 174         elif tag == "br":
 175             self.output("[[BR]]\n")
 176         elif tag == "a" and attrs.has_key('name'):
 177             self.output('[[Anchor(%s)]]' % attrs['name'])
 178         else:
 179             sys.stderr.write("Warning: ignoring <%s />\n" % tag)
 180 
 181     def handle_endtag(self, tag):
 182         if TAG_MAP.has_key(tag):
 183             (start, end) = TAG_MAP[tag]
 184             if tag[0] == 'h':
 185                 self.in_heading = False
 186             self.output(end)
 187             if tag[0] == 'h':
 188                 for word in self.queued_tags:
 189                     self.output(word, False)
 190                 self.queued_tags = []
 191         elif tag in ["ol", "ul"]:
 192             assert(self.listtype != [])
 193             self.listtype = self.listtype[:-1]
 194             self.indent -= 1
 195         elif tag == "a":
 196             if self.in_a:
 197                 self.output('] ')
 198                 self.no_newline -= 1
 199                 self.in_a = False
 200         elif tag == "tr":
 201             self.output("||")
 202             self.no_newline -= 1
 203 #            self.newline()
 204         elif tag == "th":
 205             self.output("'''")
 206         elif tag == "pre":
 207             self.output('}}}')
 208             self.preformatted = False
 209         elif tag in ["li", "table", "td"]:
 210             pass
 211 
 212     def handle_data(self, data):
 213         if self.preformatted:
 214             self.out.write(data)
 215         else:
 216             for word in data.split():
 217                 self.output(word, True)
 218 
 219     def handle_charref(self, name):
 220         sys.stderr.write("Warning: ignoring &#%s;\n" % name)
 221 
 222     def handle_entityref(self, name):
 223         if ENTITY_MAP.has_key(name):
 224             self.output(ENTITY_MAP[name], False)
 225         else:
 226             sys.stderr.write("Warning: ignoring &%s;\n" % name)
 227 
 228     def handle_comment(self, text):
 229         for line in text.splitlines():
 230             self.out.write("\n## %s" % line)
 231         self.newline()
 232 
 233 def main(argv):
 234     if len(argv) <= 1:
 235         f = sys.stdin
 236     else:
 237         try:
 238             f = file(argv[1])
 239         except IOError, e:
 240             sys.stderr.write("Error: %s\n" % e)
 241             sys.exit(1)
 242     parser = MoinConverter(sys.stdout)
 243     try:
 244         parser.feed(f.read())
 245     except HTMLParseError, e:
 246         sys.stderr.write("Parse error: %d: %s\n" % (e.lineno, e.msg))
 247         sys.exit(1)
 248 
 249 if __name__ == "__main__":
 250     sys.exit(main(sys.argv))

MoinMoin: ScriptMarket/HTMLImportScript (last edited 2007-10-29 19:19:22 by localhost)