A solution for the strong, em and u text attributes, which cause many problems because of open tags or crossed tags like <a><b></a></b>.

The idea is to keep an attributes list in the parser and, when calling formatter.text(), to pass an extra argument, attributes, which is the list of active attributes. The formatter does not have to export strong(), em() or underline() methods - and we never call such methods to open and close those attributes.

We have to call text() any time the text attributes change: Here is plain text, here is bold text, here is more plain text, here is bold and underlined text... The formatter formats each run of text using its private methods.

Here is a minimal parser that implements that idea (this second version is more like our current parser, so it will be easier to merge in):

   1 """
   2 attributed text parser
   3 """
   4 
   5 import re
   6 
   7 scanner = [
   8     r"(?P<strong>''')",
   9     r"(?P<em>'')",
  10     r"(?P<u>__)",
  11     ]
  12 scanner = re.compile('|'.join(scanner))
  13 
  14 textAttribtues = ['em', 'strong', 'u']
  15 
  16 
  17 class Parser:
  18     """ Attributed text parser """
  19     
  20     def __init__(self):
  21         self.attributes = {}
  22         self.output = []
  23             
  24     def scan(self, line):
  25         pos = 0
  26         for match in scanner.finditer(line):
  27             # Add text before match
  28             if pos < match.start():
  29                 self.addText(line[pos:match.start()])
  30             self.replace(match)
  31             pos = match.end()
  32                 
  33         # Add text after match
  34         if pos < len(line) -1:
  35             self.addText(line[pos:])
  36     
  37     def replace(self, match):
  38         """ Handle key or call handler for key """
  39         key, val = self.getKeyval(match)
  40         if key in textAttribtues:
  41             self.updateAttribtues(key)
  42         else:
  43             handler = getattr(self, '_repl_' + key)
  44             handler(match)
  45     
  46     def updateAttribtues(self, key):
  47         """ Update text attributes state """
  48         if key in self.attributes:
  49             del self.attributes[key]
  50         else:
  51             self.attributes[key] = 1
  52 
  53     def addText(self, text):
  54         attributes = self.attributes.keys()
  55         attributes.sort()
  56         self.output.append((text, attributes))
  57 
  58     def getKeyval(self, match):
  59         # Get match name
  60         for key, val in match.groupdict().items():
  61             if val:
  62                 return key, val
  63                 
attrparse.py

Here are the tests this parser passes:

   1 # -*- coding: utf-8 -*-
   2 """
   3 attribute parser tests
   4 """
   5 
   6 import unittest
   7 import attrparse        
   8 
   9 class ScanTestCase(unittest.TestCase):
  10     
  11     def setUp(self):
  12         self.parser = attrparse.Parser()
  13     
  14     def testPlain(self):
  15         """ parse text with no attribute """
  16         self.parser.scan("plain")
  17         expected = [('plain', [])]
  18         self.assertEqual(expected, self.parser.output)
  19 
  20     def testSingle(self):
  21         """ parse text with single attribute """
  22         self.parser.scan("'''strong'''")
  23         expected = [('strong', ['strong'])]
  24         self.assertEqual(expected, self.parser.output)
  25 
  26     def testPlainAndSingle(self):
  27         """ parse plain text then attributed """
  28         self.parser.scan("plain '''strong'''")
  29         expected = [
  30             ('plain ', []),
  31             ('strong', ['strong']),
  32             ]
  33         self.assertEqual(expected, self.parser.output)
  34 
  35     def testSingleAndPlain(self):
  36         """ parse attributed then plain text """
  37         self.parser.scan("'''strong''' plain")
  38         expected = [
  39             ('strong', ['strong']),
  40             (' plain', []),
  41             ]
  42         self.assertEqual(expected, self.parser.output)
  43 
  44     def testAllSingles(self):
  45         """ parse all single attributes """
  46         text = "plain '''strong''' ''em'' __u__ plain"
  47         self.parser.scan(text)
  48         expected = [
  49             ('plain ', []),
  50             ('strong', ['strong']),
  51             (' ', []),
  52             ('em', ['em']),
  53             (' ', []),
  54             ('u', ['u']),
  55             (' plain', []),
  56             ]
  57         self.assertEqual(expected, self.parser.output)
  58 
  59     def testMultipleAttribute(self):
  60         """ parse multiple attribute per run of text """
  61         text = "'''''__strong em u__'''''"
  62         self.parser.scan(text)
  63         expected = [
  64             ('strong em u', ['em', 'strong', 'u']),
  65             ]
  66         self.assertEqual(expected, self.parser.output)
  67 
  68     def testNestedAttribtues(self):
  69         """ parse nested attributes <a><b></b></a> """
  70         text = "'''strong ''strong em'''''"
  71         self.parser.scan(text)
  72         expected = [
  73             ('strong ', ['strong']),
  74             ('strong em', ['em', 'strong']),
  75             ]
  76         self.assertEqual(expected, self.parser.output)
  77 
  78     def testCrossedAttribtues(self):
  79         """ parse mixed attributes <a><b></a></b> """
  80         text = "'''strong ''strong em''' em''"
  81         self.parser.scan(text)
  82         expected = [
  83             ('strong ', ['strong']),
  84             ('strong em', ['em', 'strong']),
  85             (' em', ['em']),
  86             ]
  87         self.assertEqual(expected, self.parser.output)
  88 
  89     def testOpenAttribtue(self):
  90         """ parse open attribute <a> """
  91         text = "plain '''strong"
  92         self.parser.scan(text)
  93         expected = [
  94             ('plain ', []),
  95             ('strong', ['strong']),
  96             ]
  97         self.assertEqual(expected, self.parser.output)
  98 
  99 
 100 def suite():
 101     test_cases = [unittest.makeSuite(obj, 'test') 
 102         for name, obj in globals().items()
 103         if name.endswith('TestCase')]
 104     return unittest.TestSuite(test_cases)
 105     
 106 if __name__ == '__main__':
 107     unittest.TextTestRunner(verbosity=2).run(suite())
test_attrparse.py

MoinMoin: ParserMarket/AttributedTextParser (last edited 2007-10-29 19:08:30 by localhost)