doclib/linklint_addnames.py

   1 #!/usr/bin/python
   2
import os.path
import re
import sys

try:
    import HTMLParser  # Python 2 module name
except ImportError:
    import html.parser as HTMLParser  # Python 3 home of the same class
   4
   5 class ScanLinks(HTMLParser.HTMLParser):
   6
   7     def __init__(self, target, base):
   8         HTMLParser.HTMLParser.__init__(self)
   9         self._target = target
  10         self._base = base
  11         self._collect = False
  12         self._data = ""
  13         self._value = ""
  14
  15     def startCollect(self):
  16         self._collect = True
  17         self._data = ""
  18
  19     def endCollect(self):
  20         self._collect = False
  21         return self._data
  22
  23     def collecting(self):
  24         return self._collect
  25
  26     def handle_starttag(self,tag,attrs):
  27         m = getattr(self,'_s_'+tag.upper(),None)
  28         if m:
  29             m(attrs)
  30
  31     def handle_endtag(self,tag):
  32         m = getattr(self,'_e_'+tag.upper(),None)
  33         if m:
  34             m()
  35
  36     def handle_data(self,data):
  37         if self._collect:
  38             self._data += data
  39
  40     def handle_charref(self,name):
  41         self.handle_data(name)
  42
  43     def handle_entityref(self,name):
  44         self.handle_data(name)
  45
  46     def value(self):
  47         return self._value
  48
  49     ###########################################################################
  50
  51     SCHEME_RE=re.compile("[a-z]+:")
  52
  53     def _s_A(self,attrs):
  54         attrs = dict(attrs)
  55         url = attrs.get('href')
  56         if url and not self.SCHEME_RE.match(url):
  57             if '#' in self._target:
  58                 p = os.path.abspath(os.path.join(self._base,url))
  59             else:
  60                 p = os.path.abspath(os.path.join(self._base,url.split('#')[0]))
  61             if  p == self._target:
  62                 self.startCollect()
  63
  64     def _e_A(self):
  65         if self.collecting():
  66             self._value = self.endCollect()
  67
  68 WS_RE=re.compile("\\s+")
  69
  70 def linkName(target,f):
  71     scanner = ScanLinks(target,os.path.split(os.path.abspath(f))[0])
  72     scanner.feed(file(f).read())
  73     return WS_RE.sub(' ',scanner.value().strip())
  74
  75 process = 0
  76 target = 0
  77 for line in sys.stdin:
  78     if line.startswith('<a href='):
  79         target = line.split(':')[1]
  80         target = target[2:].split('"')[0]
  81     elif line.startswith('    <a href='):
  82         f = line.split(':')[1]
  83         f = f[2:].split('"')[0]
  84         line = '%s (%s)</a>\n'  % (line[:-5], linkName(target,f))
  85     sys.stdout.write(line)