Attachment 'search.py'

Download

   1 """
   2     MoinMoin search engine
   3     
   4     @copyright: Florian Festi TODO: email
   5     @license: GNU GPL, see COPYING for details
   6 """
   7 
   8 import re, time, sys, StringIO
   9 from MoinMoin import wikiutil, config
  10 from MoinMoin.Page import Page
  11 
  12 #try:
  13 #    import xapian
  14 #except ImportError:
  15 #    xapian = False
  16 
  17 #############################################################################
  18 ### query objects
  19 #############################################################################
  20 
  21 class BaseExpression:
  22     """ Base class for all search terms """
  23     
  24     def __init__(self):
  25         self.negated = 0
  26 
  27     def __str__(self):
  28         return unicode(self).encode(config.charset, 'replace')
  29 
  30     def negate(self):
  31         """ Negate the result of this term """
  32         self.negated = 1 
  33 
  34     def search(self, page):
  35         """ Search a page
  36 
  37         Returns a list of Match objects or None if term didn't find
  38         anything (viceversa if negate() was called).  Terms containing
  39         other terms must call this method to aggregate the results.
  40         This Base class returns True (Match()) if not negated.
  41         """
  42         if self.negated:
  43             # XXX why?
  44             return [Match()]
  45         else:
  46             return None
  47     
  48     def costs(self):
  49         """ estimated time to calculate this term
  50         
  51         Number is is relative to other terms and has no real unit.
  52         It allows to do the fast searches first.
  53         """ 
  54         return 0
  55 
  56     def highlight_re(self):
  57         """ Return a regular expression of what the term searches for
  58 
  59         Used to display the needle in the page.
  60         """
  61         return ''
  62 
  63     def indexed_query(self):
  64         """ Experimental/unused
  65 
  66         May become interface to the indexing search engine
  67         """
  68         return self
  69 
  70     def _build_re(self, pattern, use_re=False, case=False):
  71         """ Make a regular expression out of a text pattern """
  72         if case:
  73             # case sensitive
  74             flags = re.U
  75         else:
  76             # ignore case
  77             flags = re.U | re.I
  78             
  79         if use_re:
  80             try:
  81                 self.search_re = re.compile(pattern, flags)
  82             except re.error:
  83                 pattern = re.escape(pattern)
  84                 self.pattern = pattern
  85                 self.search_re = re.compile(pattern, flags)
  86         else:
  87             pattern = re.escape(pattern)
  88             self.search_re = re.compile(pattern, flags)
  89             self.pattern = pattern
  90 
  91 
  92 class AndExpression(BaseExpression):
  93     """ A term connecting several subterms with a logical AND """
  94 
  95     operator = ' '
  96 
  97     def __init__(self, *terms):
  98         self._subterms = list(terms)
  99         self._costs = 0
 100         for t in self._subterms:
 101             self._costs += t.costs()
 102         self.negated = 0
 103 
 104     def append(self, expression):
 105         """ Append another term """
 106         self._subterms.append(expression)
 107         self._costs += expression.costs()
 108 
 109     def subterms(self):
 110         return self._subterms
 111     
 112     def costs(self):
 113         return self._costs
 114 
 115     def __unicode__(self):
 116         result = ''
 117         for t in self._subterms:
 118             result += self.operator + t
 119         return u'[' + result[len(self.operator):] + u']'
 120 
 121     def pageFilter(self):
 122         """ Return a page filtering function
 123 
 124         This function is used to filter page list before we search
 125         it.
 126 
 127         Return a function that get a page name, and return bool.
 128         """
 129         # Sort terms by cost, then get all title searches
 130         self.sortByCost()
 131         terms = [term for term in self._subterms
 132                  if isinstance(term, TitleSearch)]
 133         if terms:
 134             # Create and return a filter function
 135             def filter(name):
 136                 """ A function that return True if all terms filter name """
 137                 for term in terms:
 138                     filter = term.pageFilter()
 139                     if not filter(name):
 140                         return False
 141                 return True
 142             return filter
 143         
 144         return None
 145 
 146     def sortByCost(self):
 147         tmp = [(term.costs(), term) for term in self._subterms]
 148         tmp.sort()
 149         self._subterms = [item[1] for item in tmp]       
 150 
 151     def search(self, page):
 152         """ Search for each term, cheap searches first """
 153         self.sortByCost()
 154         matches = []
 155         for term in self._subterms:
 156             result = term.search(page)
 157             if not result:
 158                 return None
 159             matches.extend(result)
 160         return matches
 161 
 162     def highlight_re(self):
 163         result = []
 164         for s in self._subterms:
 165             highlight_re = s.highlight_re()
 166             if highlight_re: result.append(highlight_re)
 167             
 168         return '|'.join(result)
 169 
 170     def indexed_query(self):
 171         indexed_terms = []
 172         sub_terms = []
 173         for term in self._subterms:
 174             term = term.indexed_query()
 175             if term is isinstance(BaseExpression):
 176                 subterms.append(term)
 177             else:
 178                 indexed_terms.append(term)
 179 
 180         if indexed_terms:
 181 
 182             if not sub_terms:
 183                 return indexed_terms
 184 
 185     def indexed_search(self):
 186         if self.indexed_query:
 187             indexed_result = self.indexed_query.indexed_query()
 188             result = []
 189             for foundpage in indexed_result:
 190                 matches = self.search(foundpage.page)
 191                 if matches:
 192                     result.append(foundpage)
 193                     foundpage.add_matches(matches)
 194 
 195 
 196 class OrExpression(AndExpression):
 197     """ A term connecting several subterms with a logical OR """
 198     
 199     operator = ' or '
 200 
 201     def search(self, page):
 202         """ Search page with terms, cheap terms first
 203 
 204         XXX Do we have any reason to sort here? we are not breaking out
 205         of the search in any case.
 206         """
 207         self.sortByCost()
 208         matches = []
 209         for term in self._subterms:
 210             result = term.search(page)
 211             if result:
 212                 matches.extend(result)
 213         return matches
 214 
 215 
 216 class TextSearch(BaseExpression):
 217     """ A term that does a normal text search
 218 
 219     Both page content and the page title are searched, using an
 220     additional TitleSearch term.
 221     """
 222     
 223     def __init__(self, pattern, use_re=False, case=False):
 224         """ Init a text search
 225 
 226         @param pattern: pattern to search for, ascii string or unicode
 227         @param use_re: treat pattern as re of plain text, bool
 228         @param case: do case sensitive search, bool 
 229         """
 230         self._pattern = unicode(pattern)
 231         self.negated = 0
 232         self._build_re(self._pattern,
 233                        use_re=use_re, case=case)
 234         self.titlesearch = TitleSearch(self._pattern, use_re=use_re, case=case)
 235         
 236     def costs(self):
 237         return 10000
 238     
 239     def __unicode__(self):
 240         if self.negated: neg = '-'
 241         else: neg = ''
 242         return u'%s"%s"' % (neg, unicode(self._pattern))
 243 
 244     def highlight_re(self):
 245         return u"(%s)" % self._pattern
 246 
 247     def pageFilter(self):
 248         """ Page filter function for single text search """
 249         return None
 250 
 251     def search(self, page):
 252         matches = []
 253 
 254         # Search in page name
 255         results = self.titlesearch.search(page)
 256         if results:
 257             matches.extend(results)
 258 
 259         # Search in page body
 260         body = page.get_raw_body()
 261         for match in self.search_re.finditer(body):
 262             matches.append(TextMatch(match.start(),match.end()))
 263 
 264         # Decide what to do with the results.
 265         if ((self.negated and matches) or
 266             (not self.negated and not matches)):
 267             return None
 268         elif matches:
 269             return matches
 270         else:
 271             # XXX why not return None or empty list?
 272             return [Match()]
 273 
 274     def indexed_query(self):
 275         return xapian.Query(self._pattern)
 276 
 277 
 278 class TitleSearch(BaseExpression):
 279     """ Term searches in pattern in page title only """
 280 
 281     def __init__(self, pattern, use_re=False, case=False):
 282         """ Init a title search
 283 
 284         @param pattern: pattern to search for, ascii string or unicode
 285         @param use_re: treat pattern as re of plain text, bool
 286         @param case: do case sensitive search, bool 
 287         """
 288         self._pattern = pattern
 289         self.negated = 0
 290         self._build_re(unicode(pattern), use_re=use_re, case=case)
 291         
 292     def costs(self):
 293         return 100
 294 
 295     def __unicode__(self):
 296         if self.negated: neg = '-'
 297         else: neg = ''
 298         return u'%s!"%s"' % (neg, unicode(self._pattern))
 299 
 300     def highlight_re(self):
 301         return u"(%s)" % self._pattern    
 302 
 303     def pageFilter(self):
 304         """ Page filter function for single title search """
 305         def filter(name):
 306             match = self.search_re.search(name)
 307             if ((self.negated and match) or
 308                 (not self.negated and not match)):
 309                 return False
 310             return True
 311         return filter
 312             
 313     def search(self, page):
 314         # Get matches in page name
 315         matches = []
 316         for match in self.search_re.finditer(page.page_name):
 317             matches.append(TitleMatch(match.start(),match.end()))
 318         
 319         if ((self.negated and matches) or
 320             (not self.negated and not matches)):
 321             return None
 322         elif matches:
 323             return matches
 324         else:
 325             # XXX why not return None or empty list?
 326             return [Match()]
 327 
 328     def indexed_query(self):
 329         return self
 330 
 331     
 332 class IndexedQuery:
 333     """unused and experimental"""
 334     def __init__(self, queryobject):
 335         self.queryobject = queryobject
 336     def indexed_search(self):
 337         pass
 338         # return list of results
 339     
 340 
 341 ############################################################################
 342 ### Results
 343 ############################################################################
 344 
 345 class Match:
 346     """ Base class for all Matches (found pieces of pages).
 347     
 348     This class represents a empty True value as returned from negated searches.
 349     """
 350     # Default match weight
 351     _weight = 1.0
 352     
 353     def __init__(self, start=0, end=0):
 354         self.start = start
 355         self.end = end
 356 
 357     def __len__(self):
 358         return self.end - self.start
 359 
 360     def __eq__(self, other):
 361         equal = (self.__class__ == other.__class__ and
 362                  self.start == other.start and
 363                  self.end == other.end)
 364         return equal
 365         
 366     def __ne__(self, other):
 367         return not self.__eq__(other)
 368 
 369     def view(self):
 370         return ''
 371 
 372     def weight(self):
 373         return self._weight
 374 
 375 
 376 class TextMatch(Match):
 377     """ Represents a match in the page content """
 378     pass
 379 
 380 
 381 class MatchInAttachment(Match):
 382     """ Represents a match in a attachment content
 383 
 384     Not used yet.
 385     """
 386     pass
 387 
 388 
 389 class TitleMatch(Match):
 390     """ Represents a match in the page title
 391     
 392     Has more weight as a match in the page content.
 393     """
 394     # Matches in titles are much more important in wikis. This setting
 395     # seems to make all pages that have matches in the title to appear
 396     # before pages that their title does not match.
 397     _weight = 100.0
 398 
 399 
 400 class FoundPage:
 401     """ Represents a page in a search result """
 402 
 403     def __init__(self, page_name, matches=None, page=None):
 404         self.page_name = page_name
 405         self.page = page
 406         if matches is None:
 407             matches = []
 408         self._matches = matches
 409 
 410     def weight(self, unique=1):
 411         """ returns how important this page is for the terms searched for
 412 
 413         Summarize the weight of all page matches
 414 
 415         @param unique: ignore identical matches
 416         @rtype: int
 417         @return: page weight
 418         """
 419         weight = 0
 420         for match in self.get_matches(unique=unique):
 421             weight += match.weight()
 422             # More sophisticated things to be added, like increase
 423             # weight of near matches.
 424         return weight
 425 
 426     def add_matches(self, matches):
 427         """ Add found matches """
 428         self._matches.extend(matches)
 429 
 430     def get_matches(self, unique=1, sort='start', type=Match):
 431         """ Return all matches of type sorted by sort
 432 
 433         @param unique: return only unique matches (bool)
 434         @param sort: match attribute to sort by (string)
 435         @param type: type of match to return (Match or sub class) 
 436         @rtype: list
 437         @return: list of matches
 438         """
 439         if unique:
 440             matches = self._unique_matches(type=type)
 441             if sort == 'start':
 442                 # matches already sorted by match.start, finished.
 443                 return matches
 444         else:
 445             matches = self._matches
 446 
 447         # Filter by type and sort by sort using fast schwartzian
 448         # transform.
 449         if sort == 'start':
 450             tmp = [(match.start, match) for match in matches
 451                    if instance(match, type)]
 452         else:
 453             tmp = [(match.weight(), match) for match in matches
 454                    if instance(match, type)]
 455         tmp.sort()
 456         if sort == 'weight':
 457             tmp.reverse()
 458         matches = [item[1] for item in tmp]
 459         
 460         return matches
 461 
 462     def _unique_matches(self, type=Match):
 463         """ Get a list of unique matches of type
 464 
 465         The result is sorted by match.start, because its easy to remove
 466         duplicates like this.
 467 
 468         @param type: type of match to return
 469         @rtype: list
 470         @return: list of matches of type, sorted by match.start
 471         """
 472         # Filter by type and sort by match.start using fast schwartzian
 473         # transform.
 474         tmp = [(match.start, match) for match in self._matches
 475                if isinstance(match, type)]
 476         tmp.sort()
 477 
 478         if not len(tmp):
 479             return []
 480 
 481         # Get first match into matches list
 482         matches = [tmp[0][1]]
 483 
 484         # Add rest of matches ignoring identical matches
 485         for item in tmp[1:]:
 486             if item[1] == matches[-1]:
 487                 continue
 488             matches.append(item[1])
 489 
 490         return matches
 491     
 492 
 493 class FoundAttachment(FoundPage):
 494     """ Represent an attachment in search results """
 495     pass
 496 
 497 
 498 ##############################################################################
 499 ### Parse Query
 500 ##############################################################################
 501 
 502 
 503 class QueryParser:
 504     """
 505     Converts a String into a tree of Query objects
 506     using recursive top/down parsing
 507     """
 508 
 509     def __init__(self, **kw):
 510         """
 511         @keyword titlesearch: treat all terms as title searches
 512         @keyword case: do case sensitive search
 513         @keyword regex: treat all terms as regular expressions
 514         """
 515         self.titlesearch = kw.get('titlesearch', 0)
 516         self.case = kw.get('case', 0)
 517         self.regex = kw.get('regex', 0)
 518 
 519     def parse_query(self, query):
 520         """ transform an string into a tree of Query objects"""
 521         self._query = query
 522         result = self._or_expression()
 523         if result is None:
 524             result = BaseExpression()
 525         return result
 526   
 527     def _or_expression(self):
 528         result = self._and_expression()
 529         if self._query:
 530             result = OrExpression(result)
 531         while self._query:
 532             q = self._and_expression()
 533             if q:
 534                 result.append(q)
 535         return result
 536             
 537     def _and_expression(self):
 538         result = None
 539         while not result and self._query:
 540             result = self._single_term()
 541         term = self._single_term()
 542         if term:
 543             result = AndExpression(result, term)
 544         else:
 545             return result
 546         term = self._single_term()
 547         while term:
 548             result.append(term)
 549             term = self._single_term()
 550         return result
 551                                 
 552     def _single_term(self):
 553         regex = (r'(?P<NEG>-?)\s*(' +              # leading '-'
 554                  r'(?P<OPS>\(|\)|(or\b(?!$)))|' +  # or, (, )
 555                  r'(?P<MOD>(\w+:)*)' +
 556                  r'(?P<TERM>("[^"]+")|' +
 557                   r"('[^']+')|(\S+)))")             # search word itself
 558         self._query = self._query.strip()
 559         match = re.match(regex, self._query, re.U)
 560         if not match:
 561             return None
 562         self._query = self._query[match.end():]
 563         ops = match.group("OPS")
 564         if ops == '(':
 565             result = self._or_expression()
 566             if match.group("NEG"): restult.negate()
 567             return result
 568         elif ops == ')':
 569             return None
 570         elif ops == 'or':
 571             return None
 572         modifiers = match.group('MOD').split(":")[:-1]
 573         text = match.group('TERM')
 574         if ((text[0] == text[-1] == '"') or
 575             (text[0] == text[-1] == "'")): text = text[1:-1]
 576 
 577         title_search = self.titlesearch
 578         regex = self.regex
 579         case = self.case
 580 
 581         for m in modifiers:
 582             if "title".startswith(m):
 583                 title_search = True
 584             elif "regex".startswith(m):
 585                 regex = True
 586             elif "case".startswith(m):
 587                 case = True
 588 
 589         if title_search:
 590             obj = TitleSearch(text, use_re=regex, case=case)
 591         else:
 592             obj = TextSearch(text, use_re=regex, case=case)
 593 
 594         if match.group("NEG"):
 595             obj.negate()
 596         return obj                
 597 
 598 
 599 class SearchResults:
 600     """ Manage search results, supply different views
 601 
 602     Search results can hold valid search results and format them for
 603     many requests, until the wiki content change.
 604 
 605     For example, one might ask for full page list sorted from A to Z,
 606     and then ask for the same list sorted from Z to A. Or sort results
 607     by name and then by rank.
 608     """
 609     # Public functions --------------------------------------------------
 610     
 611     def __init__(self, query, hits, pages, elapsed):
 612         self.query = query # the query
 613         self.hits = hits # hits list
 614         self.sort = None # hits are unsorted initially
 615         self.pages = pages # number of pages in the wiki
 616         self.elapsed = elapsed # search time
 617 
 618     def sortByWeight(self):
 619         """ Sorts found pages by the weight of the matches """
 620         tmp = [(hit.weight(), hit.page_name, hit) for hit in self.hits]
 621         tmp.sort()
 622         tmp.reverse()
 623         self.hits = [item[2] for item in tmp]
 624         self.sort = 'weight'
 625         
 626     def sortByPagename(self):
 627         """ Sorts a list of found pages alphabetical by page name """
 628         tmp = [(hit.page_name, hit) for hit in self.hits]
 629         tmp.sort()
 630         self.hits = [item[1] for item in tmp]
 631         self.sort = 'page_name'
 632         
 633     def stats(self, request, formatter):
 634         """ Return search statistics, formatted with formatter
 635 
 636         @param request: current request
 637         @param formatter: formatter to use
 638         @rtype: unicode
 639         @return formatted statistics
 640         """
 641         _ = request.getText
 642         f = formatter
 643         output = [
 644             f.paragraph(1),
 645             # TODO: update to "results of about" in 1.4
 646             f.text(_("%(hits)d results out of %(pages)d pages.") %
 647                    {'hits': len(self.hits), 'pages': self.pages}),
 648             u' (%s)' % f.text(_("%.2f seconds") % self.elapsed),
 649             f.paragraph(0),
 650             ]
 651         return ''.join(output)
 652 
 653     def pageList(self, request, formatter, info=0, numbered=1):
 654         """ Format a list of found pages
 655 
 656         @param request: current request
 657         @param formatter: formatter to use
 658         @param info: show match info in title
 659         @param numbered: use numbered list for display
 660         @rtype: unicode
 661         @return formatted page list
 662         """
 663         self._reset(request, formatter)
 664         f = formatter
 665         write = self.buffer.write
 666         if numbered:
 667             list = f.number_list
 668         else:
 669             list = f.bullet_list
 670         querystr = self.querystring()
 671             
 672         # Add pages formatted as list
 673         if self.hits:
 674             write(list(1))
 675 
 676             for page in self.hits:
 677                 matchInfo = ''
 678                 if info:
 679                     matchInfo = self.formatInfo(page)
 680                 item = [
 681                     f.listitem(1),
 682                     f.pagelink(1, page.page_name, querystr=querystr),
 683                     self.formatTitle(page),
 684                     f.pagelink(0, page.page_name),
 685                     matchInfo,
 686                     f.listitem(0),
 687                     ]
 688                 write(''.join(item))           
 689             write(list(0))
 690 
 691         return self.getvalue()
 692 
 693     def pageListWithContext(self, request, formatter, info=1, context=180,
 694                             maxlines=1):
 695         """ Format a list of found pages with context
 696 
 697         The default parameter values will create Google-like search
 698         results, as this is the most known search interface. Good
 699         interface is familiar interface, so unless we have much better
 700         solution (we don't), being like Google is the way.
 701 
 702         @param request: current request
 703         @param formatter: formatter to use
 704         @param info: show match info near the page link
 705         @param context: how many characters to show around each match. 
 706         @param maxlines: how many contexts lines to show. 
 707         @rtype: unicode
 708         @return formatted page list with context
 709         """
 710         self._reset(request, formatter)
 711         f = formatter
 712         write = self.buffer.write
 713         querystr = self.querystring()
 714         
 715         # Add pages formatted as definition list
 716         if self.hits:
 717             write(f.definition_list(1))       
 718 
 719             for page in self.hits:
 720                 matchInfo = ''
 721                 if info:
 722                     matchInfo = self.formatInfo(page)
 723                 item = [
 724                     f.definition_term(1),
 725                     f.pagelink(1, page.page_name, querystr=querystr),
 726                     self.formatTitle(page),
 727                     f.pagelink(0, page.page_name),
 728                     matchInfo,
 729                     f.definition_term(0),
 730                     f.definition_desc(1),
 731                     self.formatContext(page, context, maxlines),
 732                     f.definition_desc(0),
 733                     ]
 734                 write(''.join(item))
 735             write(f.definition_list(0))
 736         
 737         return self.getvalue()
 738 
 739     # Private -----------------------------------------------------------
 740 
    # These methods are not meant to be used by clients and may change
    # without notice.
 743     
 744     def formatContext(self, page, context, maxlines):
 745         """ Format search context for each matched page
 746 
 747         Try to show first maxlines interesting matches context.
 748         """
 749         f = self.formatter
 750         if not page.page:
 751             page.page = Page(self.request, page.page_name)
 752         body = page.page.get_raw_body()
 753         last = len(body) -1
 754         lineCount = 0
 755         output = []
 756         
 757         # Get unique text matches sorted by match.start, try to ignore
 758         # matches in page header, and show the first maxlines matches.
 759         # TODO: when we implement weight algorithm for text matches, we
 760         # should get the list of text matches sorted by weight and show
 761         # the first maxlines matches.
 762         matches = page.get_matches(unique=1, sort='start', type=TextMatch)
 763         i, start = self.firstInterestingMatch(page, matches)            
 764 
 765         # Format context
 766         while i < len(matches) and lineCount < maxlines:
 767             match = matches[i]
 768             
 769             # Get context range for this match
 770             start, end = self.contextRange(context, match, start, last)
 771 
 772             # Format context lines for matches. Each complete match in
 773             # the context will be highlighted, and if the full match is
 774             # in the context, we increase the index, and will not show
 775             # same match again on a separate line.
 776 
 777             output.append(f.text(u'...'))
 778             
 779             # Get the index of the first match completely within the
 780             # context.
 781             for j in xrange(0, len(matches)):
 782                 if matches[j].start >= start:
 783                     break
 784 
 785             # Add all matches in context and the text between them                
 786             while 1:
 787                 match = matches[j]
 788                 # Ignore matches behind the current position
 789                 if start < match.end:
 790                     # Append the text before match
 791                     if start < match.start:
 792                         output.append(f.text(body[start:match.start]))
 793                     # And the match
 794                     output.append(self.formatMatch(body, match, start))
 795                     start = match.end
 796                 # Get next match, but only if its completely within the context
 797                 if j < len(matches) - 1 and matches[j + 1].end <= end:
 798                     j += 1
 799                 else:
 800                     break
 801 
 802             # Add text after last match and finish the line
 803             if match.end < end:
 804                output.append(f.text(body[match.end:end]))
 805             output.append(f.text(u'...'))
 806             output.append(f.linebreak(preformatted=0))
 807 
 808             # Increase line and point to the next match
 809             lineCount += 1
 810             i = j + 1
 811 
 812         output = ''.join(output)
 813 
 814         if not output:
 815             # Return the first context characters from the page text
 816             output = f.text(page.page.getPageText(length=context))
 817             output = output.strip()
 818             if not output:
 819                 # This is a page with no text, only header, for example,
 820                 # a redirect page.
 821                 output = f.text(page.page.getPageHeader(length=context))
 822         
 823         return output
 824         
 825     def firstInterestingMatch(self, page, matches):
 826         """ Return the first interesting match
 827 
 828         This function is needed only because we don't have yet a weight
 829         algorithm for page text matches.
 830         
 831         Try to find the first match in the page text. If we can't find
 832         one, we return the first match and start=0.
 833 
 834         @rtype: tuple
 835         @return: index of first match, start of text
 836         """
 837         header = page.page.getPageHeader()
 838         start = len(header)
 839         # Find first match after start
 840         for i in xrange(len(matches)):
 841             if matches[i].start >= start:
 842                 return i, start
 843         return 0, 0
 844 
 845     def contextRange(self, context, match, start, last):
 846         """ Compute context range
 847 
 848         Add context around each match. If there is no room for context
 849         before or after the match, show more context on the other side.
 850 
 851         @param context: context length
 852         @param match: current match
 853         @param start: context should not start before that index, unless
 854                       end is past the last character.
 855         @param last: last character index
 856         @rtype: tuple
 857         @return: start, end of context
 858         """
 859         # Start by giving equal context on both sides of match
 860         contextlen = max(context - len(match), 0)
 861         cstart = match.start - contextlen / 2
 862         cend = match.end + contextlen / 2
 863 
 864         # If context start before start, give more context on end
 865         if cstart < start:
 866             cend += start - cstart
 867             cstart = start
 868             
 869         # But if end if after last, give back context to start
 870         if cend > last:
 871             cstart -= cend - last
 872             cend = last
 873 
 874         # Keep context start positive for very short texts
 875         cstart = max(cstart, 0)
 876 
 877         return cstart, cend
 878 
 879     def formatTitle(self, page):
 880         """ Format page title
 881 
 882         Invoke format match on all unique matches in page title.
 883 
 884         @param page: found page
 885         @rtype: unicode
 886         @return: formated title
 887         """
 888         # Get unique title matches sorted by match.start
 889         matches = page.get_matches(unique=1, sort='start', type=TitleMatch)
 890         
 891         # Format
 892         pagename = page.page_name
 893         f = self.formatter
 894         output = []
 895         start = 0
 896         for match in matches:
 897             # Ignore matches behind the current position
 898             if start < match.end:
 899                 # Append the text before the match
 900                 if start < match.start:
 901                     output.append(f.text(pagename[start:match.start]))
 902                 # And the match
 903                 output.append(self.formatMatch(pagename, match, start))
 904                 start = match.end
 905         # Add text after match
 906         if start < len(pagename):
 907             output.append(f.text(pagename[start:]))
 908 
 909         return ''.join(output)
 910 
 911     def formatMatch(self, body, match, location):
 912         """ Format single match in text
 913 
 914         Format the part of the match after the current location in the
 915         text. Matches behind location are ignored and an empty string is
 916         returned.
 917 
 918         @param text: text containing match
 919         @param match: search match in text
 920         @param location: current location in text
 921         @rtype: unicode
 922         @return: formated match or empty string
 923         """        
 924         start = max(location, match.start)
 925         if start < match.end:
 926             f = self.formatter
 927             output = [
 928                 f.strong(1),
 929                 f.text(body[start:match.end]),
 930                 f.strong(0),
 931                 ]
 932             return ''.join(output)
 933         return ''
 934 
 935     def querystring(self):
 936         """ Return query string, used in the page link """
 937         from MoinMoin.util import web
 938                 
 939         querystr = {'highlight': self.query.highlight_re()}
 940         querystr = web.makeQueryString(querystr)
 941         querystr = wikiutil.escape(querystr)
 942         return querystr
 943 
 944     def formatInfo(self, page):
 945         """ Return formated match info """
 946         # TODO: this will not work with non-html formats
 947         template = u'<span class="info"> . . . %s %s</span>'
 948         # Count number of unique matches in text of all types
 949         count = len(page.get_matches(unique=1))
 950         info = template % (count, self.matchLabel[count != 1])
 951         return self.formatter.rawHTML(info)         
 952 
 953     def getvalue(self):
 954         """ Return output in div with css class """
 955         write = self.request.write
 956         # TODO: this will not work with other formatter then
 957         # text_html. we should add a div/section creation method to all
 958         # formatters.
 959         value = [
 960             self.formatter.open('div', attr={'class': 'searchresults'}),
 961             self.buffer.getvalue(),
 962             self.formatter.close('div'),
 963             ]
 964         return '\n'.join(value)
 965 
 966     def _reset(self, request, formatter):
 967         """ Update internal state before new output
 968 
 969         Do not calls this, it should be called only by the instance
 970         code.
 971 
 972         Each request might need different translations or other user
 973         preferences.
 974         """
 975         self.buffer = StringIO.StringIO()
 976         self.formatter = formatter
 977         self.request = request
 978         # Use 1 match, 2 matches...
 979         _ = request.getText    
 980         self.matchLabel = (_('match'), _('matches'))
 981             
 982 
 983 ##############################################################################
 984 ### Searching
 985 ##############################################################################
 986 
 987 def searchPages(request, query, **kw):
 988     """
 989     Search the text of all pages for query.
 990     @param query: the expression we want to search for
 991     @rtype: SearchResults instance
 992     @return: search results
 993     """   
 994     from MoinMoin.Page import Page
 995     hits = []
 996 
 997     start = time.time()
 998 
 999     filter = query.pageFilter()
1000     if filter:
1001         # Get a list of readable pages, filtered by query page filter.
1002         pages = request.rootpage.getPageList(filter=filter)
1003     else:
1004         # Get an unfiltered list, then filter the hits. This works much
1005         # faster for common cases, and is even faster when you can't
1006         # read any page!  This might change if we cache the page list,
1007         # or storage will be faster.
1008         pages = request.rootpage.getPageList(user='', exists=0)
1009         
1010     # Search through pages
1011     for name in pages:
1012         page = Page(request, name)
1013         result = query.search(page)
1014         if result:
1015             if not filter:
1016                 # Filter deleted pages or pages the user can't read.
1017                 if not (page.exists() and request.user.may.read(name)):
1018                     continue
1019             hits.append(FoundPage(name, result))
1020             
1021     elapsed = time.time() - start
1022     count = request.rootpage.getPageCount()
1023     results = SearchResults(query, hits, count, elapsed)
1024     return results
1025    

Attached Files

To refer to attachments on a page, use attachment:filename, as shown below in the list of files. Do NOT use the URL of the [get] link, since this is subject to change and can break easily.
  • [get | view] (2005-01-24 14:47:51, 0.4 KB) [[attachment:search.patch]]
  • [get | view] (2005-01-24 17:04:04, 32.2 KB) [[attachment:search.py]]
 All files | Selected Files: delete move to page copy to page

You are not allowed to attach a file to this page.