attachment:moin-1.3_attsearch.diff of MoinMoinTodo/ExtendedSearch/AttachmentSearch

Attachment 'moin-1.3_attsearch.diff'

   1 diff -u -r -P /home/wr/install/moin-1.3/MoinMoin/action/AttachFile.py ./action/AttachFile.py
   2 --- /home/wr/install/moin-1.3/MoinMoin/action/AttachFile.py	2004-08-18 07:00:26.000000000 +0200
   3 +++ ./action/AttachFile.py	2004-08-25 15:59:38.727164243 +0200
   4 @@ -51,19 +51,33 @@
   5      """ Get directory where attachments for page `pagename` are stored.
   6      """
   7      if htdocs_access(request):
   8 +        print "a"
   9          # direct file access via webserver, from public htdocs area
  10          pagename = wikiutil.quoteWikinameFS(pagename)
  11          attach_dir = os.path.join(request.cfg.attachments['dir'], pagename, "attachments")
  12      else:
  13 +        print "b"
  14          # send file via CGI, from page storage area
  15          attach_dir = wikiutil.getPagePath(request, pagename, "attachments", check_create=create)
  16  
  17      if create and not os.path.isdir(attach_dir): 
  18          filesys.makeDirs(attach_dir)
  19  
  20 +    print "attach_dir=",attach_dir
  21      return attach_dir
  22  
  23 +def getTextVersionDir(request, pagename, create=0):
  24 +    """ Get directory where the converted text versions for page `pagename` are stored
  25 +    (data/cache/AttachSearch/<FS-quoted pagename>); it is created if `create` is true.
  26 +    """
  27 +    txt_dir = os.path.join(request.cfg.data_dir, "cache", "AttachSearch", wikiutil.quoteWikinameFS(pagename))
  28 +    # the page name is quoted like in getAttachDir, so "/" or ".." in it can't leave the cache dir
  29 +    if create and not os.path.isdir(txt_dir):
  30 +        filesys.makeDirs(txt_dir)
  31  
  32 +    return txt_dir
  33 +    
  34 +    
  35  def getAttachUrl(pagename, filename, request, addts=0):
  36      """ Get URL that points to attachment `filename` of page `pagename`.
  37 
  38 diff -u -r -P /home/wr/install/moin-1.3/MoinMoin/action/fullsearch.py ./action/fullsearch.py
  39 --- /home/wr/install/moin-1.3/MoinMoin/action/fullsearch.py	2004-08-18 07:00:31.000000000 +0200
  40 +++ ./action/fullsearch.py	2004-08-23 16:33:20.000000000 +0200
  41 @@ -38,7 +38,8 @@
  42      query = search.QueryParser(literal=request.form.has_key('literal'),
  43                                 case=case).parse_query(needle)
  44  
  45 -    hits = search.searchPages(request, query)
  46 +    # hits = search.searchPages(request, query)
  47 +    hits = search.searchEverything(request, query)
  48  
  49      search.sort_by_weight(hits)
  50      formatter = Formatter(request)
  51 diff -u -r -P /home/wr/install/moin-1.3/MoinMoin/attach2txt/__init__.py ./attach2txt/__init__.py
  52 --- /home/wr/install/moin-1.3/MoinMoin/attach2txt/__init__.py	1970-01-01 01:00:00.000000000 +0100
  53 +++ ./attach2txt/__init__.py	2004-08-24 10:49:38.000000000 +0200
  54 @@ -0,0 +1,16 @@
  55 +# -*- coding: iso-8859-1 -*-
  56 +"""
  57 +    MoinMoin - Attachment converter package
  58 +
  59 +    @copyright: 2004 Willi Richert <w.richert@gmx.net>
  60 +    @license: GNU GPL, see COPYING for details.
  61 +"""
  62 +
  63 +
  64 +from MoinMoin.util import pysupport
  65 +modules = pysupport.getPackageModules(__file__)
  66 +
  67 +import pdf2txt
  68 +
  69 +# maps an attachment suffix (without the dot) to a convert(att_fn, txt_fn) function
  70 +converter_mapping = {"pdf":pdf2txt.convert}
  71 
  72 diff -u -r -P /home/wr/install/moin-1.3/MoinMoin/attach2txt/pdf2txt.py ./attach2txt/pdf2txt.py
  73 --- /home/wr/install/moin-1.3/MoinMoin/attach2txt/pdf2txt.py	1970-01-01 01:00:00.000000000 +0100
  74 +++ ./attach2txt/pdf2txt.py	2004-08-25 16:17:40.954215485 +0200
  75 @@ -0,0 +1,26 @@
  76 +# -*- coding: iso-8859-1 -*-
  77 +"""
  78 +    MoinMoin - pdf to txt converter.
  79 +
  80 +    You will need pdftotext (xpdf package) in the PATH. Works only on Linux.
  81 +    
  82 +    @copyright: 2004 by Willi Richert (w.richert@gmx.net)
  83 +    @license: GNU GPL, see COPYING for details.
  84 +"""
  85 +
  86 +import os, mimetypes, time, urllib
  87 +from MoinMoin import config, user, util, wikiutil
  88 +from MoinMoin.Page import Page
  89 +from MoinMoin.util import MoinMoinNoFooter, filesys
  90 +
  91 +converter_name = __name__.split('.')[-1]
  92 +
  93 +def convert(att_fn, txt_fn):
  94 +    """ Run pdftotext on att_fn, writing txt_fn. Returns 0 on success, else 1 (txt_fn is then emptied). """
  95 +    # argument list instead of a shell command line: file names may contain quotes, "$" or "`"
  96 +    ret = os.spawnlp(os.P_WAIT, "pdftotext", "pdftotext", att_fn, txt_fn)
  97 +    if ret != 0:
  98 +        open(txt_fn, "w").close() # empty the file
  99 +        return 1
 100 +    else:
 101 +        return 0
 102 
 103 diff -u -r -P /home/wr/install/moin-1.3/MoinMoin/Attachment.py ./Attachment.py
 104 --- /home/wr/install/moin-1.3/MoinMoin/Attachment.py	1970-01-01 01:00:00.000000000 +0100
 105 +++ ./Attachment.py	2004-08-25 16:14:50.309377429 +0200
 106 @@ -0,0 +1,187 @@
 107 +# -*- coding: iso-8859-1 -*-
 108 +"""
 109 +    MoinMoin - Attachment class. Derived from Page.py.
 110 +
 111 +    @copyright: 2004 by Willi Richert <w.richert@gmx.net>
 112 +    @license: GNU GPL, see COPYING for details.
 113 +"""
 114 +
 115 +# Imports
 116 +import os.path
 117 +from MoinMoin import config
 118 +#import MoinMoin.util.web
 119 +from MoinMoin.logfile import eventlog
 120 +from MoinMoin.action import AttachFile
 121 +from MoinMoin.attach2txt import converter_mapping
 122 +from MoinMoin.wikiutil import quoteWikinameFS
 123 +
 124 +class Attachment:
 125 +    """Attachment - Manage an (immutable) attachment associated with a page.
 126 +    For search, the attachments text versions are saved in
 127 +    data/cache/AttachSearch/PageName/filename
 128 +    """
 129 +
 130 +    def __init__(self, request, att_name, page, **keywords):
 131 +        """
 132 +        Create attachment object.
 133 +
 134 +        @param page_name: WikiName of the associated page
 135 +        @keyword: ignored
 136 +        """
 137 +        self.request = request
 138 +        self.att_name = att_name
 139 +        
 140 +        self._assoc_page_name = page.page_name
 141 +
 142 +        self.att_filename = os.path.join(AttachFile.getAttachDir(self.request, self._assoc_page_name), self.att_name)
 143 +        txt_dir = AttachFile.getTextVersionDir(self.request, self._assoc_page_name, create=1)
 144 +        self.txt_filename = os.path.join(txt_dir, self.att_name+".txt")
 145 +        
 146 +        self.suffix = os.path.splitext(self.att_name)[1][1:] # suffix without the dot
 147 +        self._raw_body = None
 148 +        self._raw_body_modified = 0
 149 +        self.hilite_re = None
 150 +        
 151 +
 152 +
 153 +    def exists(self):
 154 +        """
 155 +        Does this page exist?
 156 +        
 157 +        @rtype: bool
 158 +        @return: true, if page exists
 159 +        """
 160 +        return os.path.exists(self.att_filename)
 161 +
 162 +
 163 +    def size(self):
 164 +        """
 165 +        Get Attachment size.
 166 +        
 167 +        @rtype: int
 168 +        @return: attachment size, 0 for non-existent pages.
 169 +        """
 170 +        if self._raw_body is not None:
 171 +            return len(self._raw_body)
 172 +
 173 +        try:
 174 +            return os.path.getsize(self.att_filename)
 175 +        except EnvironmentError, e:
 176 +            import errno
 177 +            if e.errno == errno.ENOENT: return 0
 178 +            raise
 179 +
 180 +    def getTextVersion(self, request, create=0):
 181 +        """ Returns the extracted textual content of the attachment, if possible.
 182 +        The cached text version is (re)built when it is missing or older than
 183 +        the attachment. Suffixes without a converter give "".
 184 +        @param request: the request object (not used by this method)
 185 +        @param create: not used by this method
 186 +        @rtype: string
 187 +        @return: Textual content of the attachment, "" if it can't be converted.
 188 +        """
 189 +        # if we've come so far, the attachment dir does exist together with the attachment
 190 +        att_file = self.att_filename
 191 +        txt_file = self.txt_filename
 192 +
 193 +        # assume the cached text version is usable until proven otherwise
 194 +        alreadyConverted = 1
 195 +        try:
 196 +            if not os.path.isfile(txt_file):
 197 +                alreadyConverted = 0
 198 +            elif os.path.getmtime(att_file) > os.path.getmtime(txt_file):
 199 +                # attachment changed after the text version was written
 200 +                alreadyConverted = 0
 201 +        except os.error:
 202 +            # the modification times could not be read; convert again
 203 +            alreadyConverted = 0
 204 +
 205 +        if not alreadyConverted:
 206 +            # we have to convert this attachment if the proper tools are available
 207 +            converter = converter_mapping.get(self.suffix)
 208 +            if converter is None:
 209 +                # no converter for this suffix (e.g. images): nothing to search in
 210 +                return ""
 211 +            if converter(att_file, txt_file) != 0:
 212 +                # conversion failed
 213 +                return ""
 214 +
 215 +        # read the text version; close the file even if reading fails
 216 +        txt = open(txt_file, "r")
 217 +        try:
 218 +            content = txt.read()
 219 +        finally:
 220 +            txt.close()
 221 +        return content
 222 +            
 223 +    
 224 +    def get_raw_body(self):
 225 +        """
 226 +        Load the raw textual version of the attachment and keep it in this object.
 227 +
 228 +        @rtype: string
 229 +        @return: raw text contents of this attachment, "" if the conversion failed
 230 +        """
 231 +        if self._raw_body is None:
 232 +            # the attachment is known from self; getTextVersion's optional
 233 +            # second argument is the `create` flag, so only the request is passed
 234 +            att = self.getTextVersion(self.request)
 235 +            self.set_raw_body(att)
 236 +        return self._raw_body
 237 +
 238 +
 239 +    def set_raw_body(self, body, modified=0):
 240 +        """
 241 +        Set the raw body text (prevents loading from disk).
 242 +
 243 +        @param body: raw body text
 244 +        @param modified: 1 means that we internally modified the raw text and
 245 +                         that it is not in sync with the page file on disk.
 246 +                         This is used e.g. by PageEditor when previewing the page.
 247 +        """
 248 +        self._raw_body = body
 249 +        self._raw_body_modified = modified
 250 +
 251 +    def link_to(self, request, text=None, querystr=None, anchor=None, **kw):
 252 +        """
 253 +        Return HTML markup that links to this attachment.
 254 +        See wikiutil.link_tag() for possible keyword parameters.
 255 +        NOTE(review): split_title, page_name, wikiutil, util and urllib are not defined/imported in this file; the last statement is never reached.
 256 +        @param request: the request object
 257 +        @param text: inner text of the link
 258 +        @param querystr: the query string to add after a "?" after the url
 259 +        @param anchor: if specified, make a link to this anchor
 260 +        @keyword on: opening/closing tag only
 261 +        @keyword attachment_indicator: if 1, add attachment indicator after link tag
 262 +        @keyword css_class: css class to use
 263 +        @rtype: string
 264 +        @return: formatted link
 265 +        """
 266 +        text = text or self.split_title(request)
 267 +        fmt = getattr(self, 'formatter', None)
 268 +        
 269 +        url = wikiutil.quoteWikinameURL(self.page_name)
 270 +        if querystr:
 271 +            querystr = util.web.makeQueryString(querystr)
 272 +            url = "%s?%s" % (url, querystr)
 273 +        if anchor: url = "%s#%s" % (url, urllib.quote_plus(anchor.encode(config.charset)))
 274 +
 275 +        # create a link to attachments if any exist
 276 +        attach_link = ''
 277 +        if kw.get('attachment_indicator', 0):
 278 +            from MoinMoin.action import AttachFile
 279 +            attach_link = AttachFile.getIndicator(request, self.page_name)
 280 +
 281 +        if self.exists():
 282 +            return wikiutil.link_tag(request, url, text, formatter=fmt, **kw) + attach_link
 283 +        else:
 284 +            kw['css_class'] = 'nonexistent'
 285 +            
 286 +        if request.user.show_nonexist_qm:
 287 +            return wikiutil.link_tag(request, url,
 288 +                '?', formatter=fmt, **kw) + text + attach_link
 289 +        else:
 290 +            return wikiutil.link_tag(request, url, text, formatter=fmt, **kw) + attach_link
 291 +
 292 +
 293 +        AttachFile.getAttachUrl(self._assoc_page_name, self.att_name, request, addts=0)
 294 
 295 diff -u -r -P /home/wr/install/moin-1.3/MoinMoin/formatter/text_html.py ./formatter/text_html.py
 296 --- /home/wr/install/moin-1.3/MoinMoin/formatter/text_html.py	2004-08-18 07:00:33.000000000 +0200
 297 +++ ./formatter/text_html.py	2004-08-24 09:02:59.000000000 +0200
 298 @@ -100,6 +100,14 @@
 299          apply(FormatterBase.pagelink, (self, on, pagename), kw)
 300          return Page(self.request, pagename, formatter=self).link_to(self.request, on=on, **kw)
 301  
 302 +    def attachlink(self, on, attname='', **kw):
 303 +        """ Link to an attachment.
 304 +            NOTE(review): this patch adds no import of Attachment to this file, and Attachment() also requires a `page` argument.
 305 +            See wikiutil.link_tag() for possible keyword parameters.
 306 +        """
 307 +        apply(FormatterBase.pagelink, (self, on, attname), kw)
 308 +        return Attachment(self.request, attname, formatter=self).link_to(self.request, on=on, **kw)
 309 +
 310      def interwikilink(self, on, interwiki='', pagename='', **kw):
 311          if not on: return '</a>'
 312 
 313 diff -u -r -P /home/wr/install/moin-1.3/MoinMoin/search.py ./search.py
 314 --- /home/wr/install/moin-1.3/MoinMoin/search.py	2004-08-18 07:00:25.000000000 +0200
 315 +++ ./search.py	2004-08-25 16:15:01.523658101 +0200
 316 @@ -5,10 +5,12 @@
 317      @license: GNU GPL, see COPYING for details
 318  """
 319  
 320 -import re, time, sys, urllib
 321 +import re, time, sys, urllib, os
 322  #sys.path.append('..')
 323  from MoinMoin import wikiutil, config
 324  from MoinMoin.Page import Page
 325 +from MoinMoin.action import AttachFile
 326 +from Attachment import Attachment
 327  
 328  #try:
 329  #    import xapian
 330 @@ -165,11 +167,11 @@
 331      def highlight_re(self):
 332          return u"(%s)" % self.pattern
 333  
 334 -    def search(self, page):
 335 -        body = page.get_raw_body()
 336 +    def search(self, obj): # obj is page or attachment
 337 +        body = obj.get_raw_body()
 338  
 339          pos = 0
 340 -        fragments = self.titlesearch.search(page)
 341 +        fragments = self.titlesearch.search(obj)
 342          if fragments is None: fragments = []
 343          while 1:
 344              match = self.search_re.search(body, pos)
 345 @@ -205,8 +207,11 @@
 346      def highlight_re(self):
 347          return u"(%s)" % self.pattern    
 348  
 349 -    def search(self, page):
 350 -        match = self.search_re.search(page.page_name)
 351 +    def search(self, obj):
 352 +        if isinstance(obj, Page):
 353 +            match = self.search_re.search(obj.page_name)
 354 +        else:
 355 +            match = self.search_re.search(obj.att_name)
 356          if ((self.negated and match) or
 357              (not self.negated and not match)):
 358              return None
 359 @@ -230,10 +235,9 @@
 360  ### Results
 361  ############################################################################
 362          
 363 -
 364 -class FoundPage:
 365 -    def __init__(self, page_name, matches=[], page=None):
 366 -        self.page_name = page_name
 367 +class FoundObject:
 368 +    def __init__(self, name, matches=[], page=None):
 369 +        self.page_name = name
 370          self.page = page
 371          self._matches = matches
 372  
 373 @@ -252,11 +256,20 @@
 374      def get_matches(self):
 375          return self._matches[:]
 376  
 377 +    
 378 +class FoundPage(FoundObject):
 379 +    pass
 380  
 381 -
 382 -class FoundAttachment(FoundPage):
 383 +class FoundAttachment(FoundObject):
 384 +    """
 385 +    The attachments text versions are saved in
 386 +    data/cache/AttachSearch/PageName/filename
 387 +    """
 388 +    # TODO: needs to be more attachment like
 389      pass
 390  
 391 +
 392 +
 393  class Match:
 394      def __init__(self, start=0, end=0):
 395          self.start = start
 396 @@ -382,6 +395,40 @@
 397  
 398      return hits
 399  
 400 +def searchEverything(request, query, **kw):
 401 +    """
 402 +    Search the text of all readable pages and of their attachments for query.
 403 +    @param request: the request object
 404 +    @param query: the expression we want to search for
 405 +    @keyword: ignored
 406 +    @rtype: list
 407 +    @return: List of FoundPage and FoundAttachment objects
 408 +    """
 409 +    hits = []
 410 +    all_pages = wikiutil.getPageList(request.cfg.text_dir)
 411 +    for page_name in all_pages:
 412 +        if not request.user.may.read(page_name):
 413 +            continue
 414 +        page = Page(request, page_name)
 415 +        result = query.search(page)
 416 +        if result:
 417 +            hits.append(FoundPage(page_name, result))
 418 +
 419 +        # search now in all attachments of this page
 420 +        attach_dir = AttachFile.getAttachDir(request, page_name)
 421 +        if not os.path.isdir(attach_dir):
 422 +            continue
 423 +        for att in os.listdir(attach_dir):
 424 +            if not os.path.isfile(os.path.join(attach_dir, att)):
 425 +                continue # only regular files can be converted and searched
 426 +            result = query.search(Attachment(request, att, page)) # TODO: argument hides as a page
 427 +            if result:
 428 +                # TODO: append not the page name but the attachment itself directly by means of
 429 +                # FoundAttachment (see the class stub above)
 430 +                hits.append(FoundAttachment(page_name, result))
 431 +
 432 +    return hits
 433 +    
 434  
 435  ##############################################################################
 436  ### Sort results
Attached Files

To refer to attachments on a page, use attachment:filename, as shown below in the list of files. Do NOT use the URL of the [get] link, since this is subject to change and can break easily.
You are not allowed to attach a file to this page.
MoinMoin: attachment:moin-1.3_attsearch.diff of MoinMoinTodo/ExtendedSearch/AttachmentSearch

Attachment 'moin-1.3_attsearch.diff'

Attached Files