Attachment 'quote.py'
Download 1 """ Quote and unquote wiki names
2
3 This will go into wikiutil and test_wikiutil. Because the tests are
4 broken at this time, I am working on this file.
5 """
6
7 import unittest
8 import re
9 import time
10 import sys
11
class InvalidFileNameError(Exception):
    """ Raised by the unquote functions for a malformed quoted file name """
13
# Precompiled patterns

# A run of characters that must be quoted: anything but ASCII letters
# and digits
UNSAFE = re.compile(r'[^a-zA-Z0-9]+')
# "(" followed by anything up to the next ")", the ")" itself being
# optional - matches malformed sequences too, so they can be rejected
QUOTED = re.compile(r'\(([^\)]*)\)?')
# Well formed quoted sequence only: hex digits enclosed in parentheses
QUOTED_VALID = re.compile(r'\(([a-fA-F0-9]+)\)')
# Old quoting styles: a run of "_xx" (pre 1.3) or "(xx)" (dev 1.3).
# The converters decode each pair with int(pair, 16), so only hex
# digits are accepted here; any other pair is left alone as plain text
# instead of raising ValueError during conversion.
QUOTED_PRE_13 = re.compile(r'(_[a-fA-F0-9]{2})+')
QUOTED_DEV_13 = re.compile(r'(\([a-fA-F0-9]{2}\))+')
20
# Minimal stand-ins for MoinMoin objects, so this file can be tested
# without moin moin
class config:
    """ Mock wiki configuration - only the charset is used here """
    charset = 'utf8'
24
def decodeUserInput(string, charsets=[config.charset]):
    """ Decode an encoded string, trying each charset in turn.

    The original mock looked only at charsets[0] and ignored the rest
    of the list. Now every charset is tried in order and the first one
    that decodes the string wins.

    @param string: encoded string
    @param charsets: list of charset names to try, in order
    @rtype: Unicode String
    @return: decoded string
    @raise UnicodeError: if no charset can decode the string (the error
        of the last charset is raised)
    @raise IndexError: if charsets is empty
    """
    for charset in charsets[:-1]:
        try:
            return string.decode(charset)
        except UnicodeError:
            # Deliberate: this charset does not fit, try the next one
            pass
    # The last candidate is decoded outside the try block, so its error
    # (or the IndexError for an empty list) reaches the caller as before
    return string.decode(charsets[-1])
27
def quoteWikinameFS(wikiname, charset=config.charset):
    """ Return file system representation of a Unicode WikiName.

    The name is encoded using charset. Then each run of characters
    other than ASCII letters and digits is replaced by the two digit
    hex values of its characters, enclosed in one pair of parentheses,
    e.g. u'free link' becomes 'free(20)link'.

    Todo: We should use only unicode encoding for file names, to prevent
    possible configuration error.

    @param wikiname: Unicode string possibly containing non-ascii characters
    @param charset: charset to encode string
    @rtype: string
    @return: quoted name, safe for any file system
    """
    def hexRun(match):
        # One unsafe run becomes a single "(xxyy...)" group
        digits = ['%02x' % ord(char) for char in match.group()]
        return '(%s)' % ''.join(digits)

    encoded = wikiname.encode(charset)
    return UNSAFE.sub(hexRun, encoded)
57
58
def unquoteWikiname_valid_only_re(filename, charsets=[config.charset], safe=1):
    """ Return Unicode WikiName from quoted file name.

    Only well formed sequences - hex digits in parentheses - are
    matched. We raise an InvalidFileNameError if we find an invalid
    name, so the wiki could alarm the admin or suggest the user to
    rename a page. Invalid file names should never happen in normal
    use, but are rather cheap to find.

    The parenthesis check looks only at the text outside of the matched
    sequences. Checking the decoded name, as was done before, rejected
    valid names whose quoted bytes decode to "(" or ")", e.g. 'A(28)B'.

    @param filename: string using charset and possible quoted parts
    @param charsets: list of charsets passed to decodeUserInput
    @param safe: if true, reject any "(" or ")" left outside of a well
        formed sequence; if false, such text is kept as is
    @rtype: Unicode String
    @return: WikiName
    @raise InvalidFileNameError: for a sequence with an odd number of
        hex digits, or (when safe) for leftover parentheses
    """
    ### Temporary fix start ###
    # from some places we get called with unicode
    if isinstance(filename, type(u'')):
        filename = filename.encode(config.charset)
    ### Temporary fix end ###

    parts = []
    start = 0
    for needle in QUOTED_VALID.finditer(filename):
        # Unquoted stuff before this sequence. Any parenthesis here was
        # not part of a well formed sequence, so it must be invalid.
        literal = filename[start:needle.start()]
        if safe and ('(' in literal or ')' in literal):
            raise InvalidFileNameError(filename)
        parts.append(literal)
        start = needle.end()

        # Quoted stuff: every byte needs exactly two hex digits
        group = needle.group(1)
        if len(group) % 2 != 0:
            raise InvalidFileNameError(filename)
        # The pattern matched hex digits only, so int() can not fail
        for i in range(0, len(group), 2):
            parts.append(chr(int(group[i:i+2], 16)))

    # Rest of the string - the whole name when nothing was quoted
    rest = filename[start:]
    if safe and ('(' in rest or ')' in rest):
        raise InvalidFileNameError(filename)
    parts.append(rest)
    return decodeUserInput(''.join(parts), charsets)
109
110
def unquoteWikiname_re(filename, charsets=[config.charset], safe=1):
    """ Return Unicode WikiName from quoted file name.

    We raise an InvalidFileNameError if we find an invalid name, so the
    wiki could alarm the admin or suggest the user to rename a page.
    Invalid file names should never happen in normal use, but are rather
    cheap to find.

    The lonely ")" check looks only at the text outside of the quoted
    sequences. Checking the decoded name, as was done before, rejected
    valid names whose quoted bytes decode to ")", e.g. 'A(29)B'.

    @param filename: string using charset and possible quoted parts
    @param charsets: list of charsets passed to decodeUserInput
    @param safe: if true, reject a sequence without closing ")" and a
        lonely ")"; if false, a sequence running to the end of the
        string is decoded as if it was closed, and a lonely ")" is kept
    @rtype: Unicode String
    @return: WikiName
    @raise InvalidFileNameError: for an empty sequence, an odd number
        of digits, non hex digits, or (when safe) the cases above
    """
    ### Temporary fix start ###
    # from some places we get called with unicode
    if isinstance(filename, type(u'')):
        filename = filename.encode(config.charset)
    ### Temporary fix end ###

    parts = []
    start = 0
    for needle in QUOTED.finditer(filename):
        # Unquoted stuff before this sequence. Every "(" starts a
        # match, so only a lonely ")" can show up here.
        literal = filename[start:needle.start()]
        if safe and ')' in literal:
            raise InvalidFileNameError(filename)
        parts.append(literal)
        start = needle.end()

        # Filter invalid sequences. Only the missing ")" test depends
        # on safe - same as the original expression, which relied on
        # "and" binding tighter than "or".
        group = needle.group(1)
        unterminated = not needle.group().endswith(')')
        if ((safe and unterminated) or
            len(group) == 0 or
            len(group) % 2 != 0):
            raise InvalidFileNameError(filename)
        try:
            for i in range(0, len(group), 2):
                parts.append(chr(int(group[i:i+2], 16)))
        except ValueError:
            # byte not in hex, e.g 'xy'
            raise InvalidFileNameError(filename)

    # Rest of the string - the whole name when nothing was quoted
    rest = filename[start:]
    if safe and ')' in rest:
        raise InvalidFileNameError(filename)
    parts.append(rest)
    return decodeUserInput(''.join(parts), charsets)
164
165
def unquoteWikiname(filename, charsets=[config.charset], safe=1):
    """ Return Unicode WikiName from quoted file name, using find().

    We raise an InvalidFileNameError if we find an invalid name, so the
    wiki could alarm the admin or suggest the user to rename a page.
    Invalid file names should never happen in normal use, but are rather
    cheap to find.

    Two defects of the previous version are fixed here:

    - With safe=0, a sequence without closing ")" starting at an odd
      offset (e.g. '(2f') passed the checks, reset the scan position
      to 0 and looped forever. An unterminated sequence is now always
      invalid, whatever safe is.
    - The lonely ")" check was done on the decoded name, rejecting valid
      names whose quoted bytes decode to ")", e.g. 'A(29)B'. Now only
      the text outside of the quoted sequences is checked.

    @param filename: string using charset and possible quoted parts
    @param charsets: list of charsets passed to decodeUserInput
    @param safe: if true, reject a lonely ")" outside of a sequence
    @rtype: Unicode String
    @return: WikiName
    @raise InvalidFileNameError: for an unterminated or empty sequence,
        an odd number of digits, non hex digits, or (when safe) a
        lonely ")"
    """
    ### Temporary fix start ###
    # from some places we get called with unicode
    if isinstance(filename, type(u'')):
        filename = filename.encode(config.charset)
    ### Temporary fix end ###

    parts = []
    end = 0 # The character after the end of the last quoted sequence
    while True:
        start = filename.find('(', end)
        if start == -1:
            break
        # Unquoted stuff before the sequence
        literal = filename[end:start]
        if safe and ')' in literal:
            raise InvalidFileNameError(filename)
        parts.append(literal)

        start += 1 # skip the "("
        end = filename.find(')', start)
        # Filter invalid sequences: unterminated, empty, odd length
        if end == -1 or end == start or (end - start) % 2 != 0:
            raise InvalidFileNameError(filename)
        try:
            for i in range(start, end, 2):
                parts.append(chr(int(filename[i:i+2], 16)))
        except ValueError:
            # byte not in hex, e.g 'xy'
            raise InvalidFileNameError(filename)
        end += 1 # skip the ")"

    # Rest of the string - the whole name when nothing was quoted
    rest = filename[end:]
    if safe and ')' in rest:
        raise InvalidFileNameError(filename)
    parts.append(rest)
    return decodeUserInput(''.join(parts), charsets)
221
222
def convertPre13FileName(filename, newEncoding='utf8'):
    """ Return new style quoted filename from pre 1.3 quoted file name.

    Each run matched by QUOTED_PRE_13 is read as "_xx" triplets and
    every xx is turned into the character with that hex value. The
    unquoted name is decoded as utf8 and passed to quoteWikinameFS().

    @param filename: string using charset and possible pre 1.3 quoting
    @param newEncoding: charset passed to quoteWikinameFS
    @rtype: String
    @return: New style quoted filename
    """
    pieces = []
    cursor = 0
    for match in QUOTED_PRE_13.finditer(filename):
        # Plain text between the previous run and this one
        pieces.append(filename[cursor:match.start()])
        cursor = match.end()
        # Step over the run three characters at a time, skipping the
        # underscore in front of each pair
        run = match.group()
        pieces.extend([chr(int(run[pos + 1:pos + 3], 16))
                       for pos in range(0, len(run), 3)])

    # Plain text after the last run
    pieces.append(filename[cursor:])

    unquoted = ''.join(pieces)
    return quoteWikinameFS(unquoted.decode('utf8'), charset=newEncoding)
252
253
def convertDev13FileName(filename, oldEncoding='utf8', newEncoding='utf8'):
    """ Return new style quoted filename from development 1.3 quoted
    file name.

    Each run matched by QUOTED_DEV_13 is read as "(xx)" groups and
    every xx is turned into the character with that hex value. The
    unquoted name is decoded using oldEncoding and passed to
    quoteWikinameFS().

    1.3 development used config.charset to encode file
    names. Theoretically, someone might have used non Unicode to encode
    file names.

    @param filename: string using charset and possible dev 1.3 quoting
    @param oldEncoding: charset used to decode the unquoted name
    @param newEncoding: charset passed to quoteWikinameFS
    @rtype: String
    @return: New style quoted filename
    """
    pieces = []
    cursor = 0
    for match in QUOTED_DEV_13.finditer(filename):
        # Plain text between the previous run and this one
        pieces.append(filename[cursor:match.start()])
        cursor = match.end()
        # Step over the run four characters at a time, taking the pair
        # between each "(" and ")"
        run = match.group()
        pieces.extend([chr(int(run[pos + 1:pos + 3], 16))
                       for pos in range(0, len(run), 4)])

    # Plain text after the last run
    pieces.append(filename[cursor:])

    unquoted = ''.join(pieces)
    return quoteWikinameFS(unquoted.decode(oldEncoding), charset=newEncoding)
289
290
291 class QuoteTestCase(unittest.TestCase):
292 """ wikiutil quoting tests """
293
294 TESTS = (
295 # WikiName, Quoted
296 # Space
297 (u' ', '(20)'),
298 # Plain ASCII
299 (u'WikiName', 'WikiName'),
300 # Free links
301 (u'free link', 'free(20)link'),
302 # Underscore link
303 (u'underscore_link', 'underscore(5f)link'),
304 # Subpages
305 (u'Page/SubPage/SubSubPage', 'Page(2f)SubPage(2f)SubSubPage'),
306 # Hebrew
307 (u'\u05e0\u05d9\u05e8', '(d7a0d799d7a8)'),
308 # Hebrew sub pages
309 (u'\u05e0\u05d9\u05e8\u002f\u05e0\u05d9\u05e8',
310 '(d7a0d799d7a82fd7a0d799d7a8)'),
311 # Combination
312 (u'Page\u002f\u05e0\u05d9\u05e8\u002fSubPage',
313 'Page(2fd7a0d799d7a82f)SubPage'),
314 # Add more tests
315 )
316
317 def testQuoteWikiNameForFileSystem(self):
318 """ wikiutil: quote wiki names for file system """
319 for test, expected in self.TESTS:
320 result = quoteWikinameFS(test, charset='utf8')
321 self.failUnlessEqual(result, expected,
322 'expected "%(expected)s" but got "%(result)s"' % locals())
323
324 def testunQuoteFileName(self):
325 """ wikiutil: unquote file using .find() """
326 for expected, test in self.TESTS:
327 result = unquoteWikiname(test, charsets=['utf8'])
328 self.failUnlessEqual(result, expected,
329 'expected "%(expected)s" but got "%(result)s"' % locals())
330
331 def testunQuoteFileName_re(self):
332 """ wikiutil: unquote file names using regex """
333 for expected, test in self.TESTS:
334 result = unquoteWikiname_re(test, charsets=['utf8'])
335 self.failUnlessEqual(result, expected,
336 'expected "%(expected)s" but got "%(result)s"' % locals())
337
338 def testunQuoteFileName_valid_only_re(self):
339 """ wikiutil: unquote file using valid only regex """
340 for expected, test in self.TESTS:
341 result = unquoteWikiname_valid_only_re(test, charsets=['utf8'])
342 self.failUnlessEqual(result, expected,
343 'expected "%(expected)s" but got "%(result)s"' % locals())
344
345
346 class InvalidFileNameTestCase(unittest.TestCase):
347 """ wikiutil quoting tests """
348
349 TESTS = (
350 # Quoted file names
351 'A()B', # Empty braces
352 'A(d7a)B', # Odd number of quoted characters
353 'A(xy)B', # Non hex characters
354 'A(2f', # Open sequence
355 'A2f)', # Lonely close tag
356 # Add more tests
357 )
358
359 def testInvalidFileName(self):
360 """ wikiutil: invalid file names raise exception using find """
361 for test in self.TESTS:
362 self.failUnlessRaises(InvalidFileNameError,
363 unquoteWikiname, test, charsets=['utf8'])
364
365 def testInvalidFileNameRE(self):
366 """ wikiutil: invalid file names raise exception using re"""
367 for test in self.TESTS:
368 self.failUnlessRaises(InvalidFileNameError,
369 unquoteWikiname_re, test, charsets=['utf8'])
370
371 def testInvalidFileNameValidOnlyRE(self):
372 """ wikiutil: invalid file names raise exception using valid only re"""
373 for test in self.TESTS:
374 self.failUnlessRaises(InvalidFileNameError,
375 unquoteWikiname_valid_only_re, test, charsets=['utf8'])
376
377
378 class ConvertPre13FileNameTestCase(unittest.TestCase):
379 """ Convert pre 1.3 file names to 1.3 file names
380
381 Pre 1.3 used different quoting - each quoted character was prefixed
382 by an underscode e.g._2f.
383 """
384
385 TESTS = (
386 # New style quoting, Pre 1.3 style quoting
387 # Space
388 ('(20)', '_20'),
389 # Plain ASCII
390 ('WikiName', 'WikiName'),
391 # Free link
392 ('free(20)link', 'free_20link'),
393 # Underscore link
394 ('underscore(5f)link', 'underscore_5flink'),
395 # Subpages
396 ('Page(2f)SubPage', 'Page_2fSubPage'),
397 # Hebrew
398 ('(d7a0d799d7a8)', '_d7_a0_d7_99_d7_a8'),
399 # Hebrew sub pages
400 ('(d7a0d799d7a82fd7a0d799d7a8)',
401 '_d7_a0_d7_99_d7_a8_2f_d7_a0_d7_99_d7_a8'),
402 # Combination
403 ('Page(2fd7a0d799d7a82f)SubPage',
404 'Page_2f_d7_a0_d7_99_d7_a8_2fSubPage'),
405 # Add more tests
406 )
407
408 def testConvertPre13FileName(self):
409 """ wikiutil: Convert pre 1.3 file names to 1.3 file names """
410 for expected, test in self.TESTS:
411 result = convertPre13FileName(test, newEncoding='utf8')
412 self.failUnlessEqual(result, expected,
413 'expected %(expected)s but got %(result)s' % locals())
414
415
416 class ConvertDev13FileNameTestCase(unittest.TestCase):
417 """ Convert development 1.3 file names to 1.3 file names
418
419 Currently test only utf8 encoded names. Theoretically, someone might
420 have used other encoding - the dev 1.3 code use the wiki charset
421 encoding.
422
423 Development 1.3 used different quoting - each quoted character was
424 enclosed in bracdes e.g. (xx)(xx).
425 """
426
427 TESTS = (
428 # New style quoting, Dev 1.3 style quoting
429 # Space
430 ('(20)', '(20)'),
431 # Plain ASCII
432 ('WikiName', 'WikiName'),
433 # Subpages
434 ('Page(2f)SubPage', 'Page(2f)SubPage'),
435 # Hebrew
436 ('(d7a0d799d7a8)', '(d7)(a0)(d7)(99)(d7)(a8)'),
437 # Hebrew sub pages
438 ('(d7a0d799d7a82fd7a0d799d7a8)',
439 '(d7)(a0)(d7)(99)(d7)(a8)(2f)(d7)(a0)(d7)(99)(d7)(a8)'),
440 # Combination
441 ('Page(2fd7a0d799d7a82f)SubPage',
442 'Page(2f)(d7)(a0)(d7)(99)(d7)(a8)(2f)SubPage'),
443 # Add more tests
444 )
445
446 def testConvertDev13FileName(self):
447 """ wikiutil: Convert development 1.3 file names to 1.3 file names """
448 for expected, test in self.TESTS:
449 result = convertDev13FileName(test,
450 oldEncoding='utf8',
451 newEncoding='utf8')
452 self.failUnlessEqual(result, expected,
453 'expected %(expected)s but got %(result)s' % locals())
454
class TimeUnquoteWikinameTestCase(unittest.TestCase):
    """ Time unquoting wiki names

    I'm not sure if we should use the re version which is little more
    clean, or the find version.

    Each implementation is timed twice, with safe=1 and with safe=0,
    so the cost of the extra checks can be compared.

    The valid only regex pair used to be swapped: testValidOnlyRE ran
    with safe=1 and testSafeValidOnlyRE with safe=0, unlike the other
    pairs. Now every testSafe* method runs with safe=1.
    """

    # Tests contain a mix of names, typical for multi language wiki
    TESTS = (
        # Typical Plain ASCII names
        'RecentChanges', 'FrontPage', 'UserPreferences', 'WhyWikiWorks',
        # Extended name
        'extented(20)page(20)name',
        # Underscored name
        'underscored(5f)page(5f)name',
        # Subpages
        'ParentPage(2f)SubPage(2f)SubSubPage',
        # Hebrew (Recent Changes)
        '(d7a9d799d7a0d795d799d79d20d790d797d7a8d795d7a0d799d79d)',
        # Hebrew sub pages (RecentChanges/RecentChanges)
        ('(d7a9d799d7a0d795d799d79d20d790d797d7a8d795d7a0d799d79d'
         '2f'
         'd7a9d799d7a0d795d799d79d20d790d797d7a8d795d7a0d799d79d)'),
        # Combination
        '(d7a9d799d7a0d795d799d79d20d790d797d7a8d795d7a0d799d79d2f)SubPage',
        # Add more tests
        )

    def time(self, callable, **kw):
        """ Call callable on all TESTS 100 times and print the elapsed time

        @param callable: unquote function to time
        @param kw: keyword arguments passed to callable on each call
        """
        start = time.time()
        for i in range(100):
            for test in self.TESTS:
                callable(test, **kw)
        sys.stdout.write('%.4fs ' % (time.time() - start))
        sys.stdout.flush()

    def testSafeFind(self):
        """ Time unquoteWikiname using find safely: """
        self.time(unquoteWikiname, safe=1)

    def testFind(self):
        """ Time unquoteWikiname using find: """
        self.time(unquoteWikiname, safe=0)

    def testSafeRE(self):
        """ Time unquoteWikiname using regex safely: """
        self.time(unquoteWikiname_re, safe=1)

    def testRE(self):
        """ Time unquoteWikiname using regex: """
        self.time(unquoteWikiname_re, safe=0)

    def testSafeValidOnlyRE(self):
        """ Time unquoteWikiname using valid only regex safely: """
        self.time(unquoteWikiname_valid_only_re, safe=1)

    def testValidOnlyRE(self):
        """ Time unquoteWikiname using valid only regex: """
        self.time(unquoteWikiname_valid_only_re, safe=0)
516
517
if __name__ == '__main__':
    # Build the suite with a TestLoader; unittest.makeSuite is
    # deprecated and no longer exists in recent Python versions.
    # Cases are added in the same order as before.
    loader = unittest.TestLoader()
    suite = unittest.TestSuite()
    for case in (QuoteTestCase,
                 InvalidFileNameTestCase,
                 ConvertPre13FileNameTestCase,
                 ConvertDev13FileNameTestCase,
                 TimeUnquoteWikinameTestCase):
        suite.addTest(loader.loadTestsFromTestCase(case))
    unittest.TextTestRunner(verbosity=2).run(suite)
526
Attached Files
To refer to attachments on a page, use attachment:filename, as shown below in the list of files. Do NOT use the URL of the [get] link, since this is subject to change and can break easily. You are not allowed to attach a file to this page.