This script extracts external links from spammed pages and prints them as regular expressions ready to paste into BadContent.

   1 """
   2 Extract links from spam and return ready to paste regular expressions.
   3 """
   4 
   5 import sys
   6 import re
   7 import urlparse
   8 
   9 urlPattern = re.compile(r'\bhttps?://[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]+',
  10                         re.IGNORECASE)
  11 
  12 def extractPatterns(text):
  13     patterns = {}
  14     for link in urlPattern.findall(text):
  15         # antispam care only about the network location
  16         netloc = urlparse.urlparse(link)[1]
  17         # Ignore www subdomain
  18         netloc = netloc.replace('www.', '')
  19         netloc = netloc.replace('.', '\.')
  20         patterns[netloc] = None
  21     return patterns.keys()
  22     
  23 
  24 def run():
  25     text = file(sys.argv[1]).read()
  26     patterns = extractPatterns(text)
  27     print '\n'.join(patterns)
  28 
  29 
# Run the extraction only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == '__main__':
    run()
  32         
spam.py

MoinMoin: ScriptMarket/ExtractBadContentScript (last edited 2007-10-29 19:10:03 by localhost)