doclib/fix-links.py

   1 #!/usr/bin/python
   2 #
   3 # This tool will hack the doxygen generated documentation to fix link
   4 # errors produced by doxygen.
   5 #
   6 # This works because most anchors doxygen generates are unique 32 char
   7 # hash values. To speed up the operation, the tool will not check all
   8 # the files itself but will let 'linklint' do the grunt
   9 # work. fix-links.py reads the 'errorX.txt' and 'errorAX.txt' files
  10 # generated by linklint. These files list links to missing files
  11 # (errorX.html) and links to missing anchors
  12 # (errorAX.html). fix-links.py works in the following way:
  13 #
  14 # - Build a complete index of all unique anchors found in any html
  15 #   file. The index will only include *unique* anchors. Anchors found
  16 #   multiple times are removed from the index
  17 #
  18 # - The index is extended to include all unique names of html files
  19 #
  20 # - Scan the linklint result and check the bad links against the
  21 #   index. If the file or anchor is found in the index, an accordingly
  22 #   corrected link is generated; otherwise the link is removed.
  23 #
  24 # One additional tweak is that fix-links.py will successively remove
  25 # initial 'g' characters from anchors until the link is found in the
  26 # index. Doxygen sometimes seems to create links with the wrong
  27 # number of 'g' characters in front.
  28
  29 import sys,os.path,fnmatch, HTMLParser, getopt, re, codecs
  30
  31 class HTMLFilter(HTMLParser.HTMLParser):
  32
  33     def __init__(self, out=None):
  34         HTMLParser.HTMLParser.__init__(self)
  35         self._out = out
  36         self._collect = False
  37         self._data = ""
  38
  39     def startCollect(self):
  40         self._collect = True
  41         self._data = ""
  42
  43     def endCollect(self):
  44         self._collect = False
  45         return self._data
  46
  47     def collecting(self):
  48         return self._collect
  49
  50     def handle_starttag(self,tag,attrs):
  51         m = getattr(self,'_s_'+tag.upper(),None)
  52         if m:
  53             m(attrs)
  54         else:
  55             self.emit_starttag(tag,attrs)
  56
  57     def handle_endtag(self,tag):
  58         m = getattr(self,'_e_'+tag.upper(),None)
  59         if m:
  60             m()
  61         else:
  62             self.emit_endtag(tag)
  63
  64     def handle_data(self,data):
  65         if self._collect:
  66             self._data += data
  67         if self._out:
  68             self._out.write(data)
  69
  70     def handle_charref(self,name):
  71         self.handle_data('&#%s;' % name)
  72
  73     def handle_entityref(self,name):
  74         self.handle_data('&%s;' % name)
  75
  76     def emit_starttag(self,tag,attrs):
  77         self.handle_data('<%s%s>' % (tag, "".join([' %s="%s"' % attr for attr in attrs])))
  78
  79     def emit_endtag(self,tag):
  80         self.handle_data('</%s>' % tag)
  81
  82
  83 class AnchorIndex:
  84
  85     def __init__(self, skipdirs = ('.svn',)):
  86         self._anchors = {}
  87         self._skipdirs = skipdirs
  88
  89     def build(self):
  90         sys.stderr.write("Building anchor index ")
  91         nf = 0
  92         for dirname, subdirs, files in os.walk('.'):
  93             for d in self._skipdirs:
  94                 if d in subdirs:
  95                     subdirs.remove(d)
  96             for f in fnmatch.filter(files,'*.html'):
  97                 nf += 1
  98                 path = os.path.normpath(os.path.join(dirname, f))
  99                 self._addAnchor(f, path)
 100                 self._extractAnchors(path)
 101         sys.stderr.write(" Done.\n")
 102         dups = 0
 103         for k in self._anchors.keys():
 104             if not self._anchors[k]:
 105                 dups += 1
 106                 del self._anchors[k]
 107         sys.stderr.write("%d unique anchors in %d files (%d duplicates)\n"
 108                          % (len(self._anchors), nf, dups))
 109
 110     def _addAnchor(self, anchor, path):
 111         if self._anchors.has_key(anchor):
 112             self._anchors[anchor] = None
 113         else:
 114             self._anchors[anchor] = path
 115             if len(self._anchors) % 100 == 0:
 116                 sys.stderr.write('.')
 117
 118     def __getitem__(self, key):
 119         return self._anchors.get(key)
 120
 121     class AnchorExtractor(HTMLFilter):
 122
 123         def __init__(self):
 124             HTMLFilter.__init__(self)
 125             self._anchors = {}
 126
 127         def _s_A(self,attrs):
 128             attrs = dict(attrs)
 129             if attrs.has_key('name'):
 130                 self._anchors[attrs['name']] = None
 131
 132         def anchors(self):
 133             return self._anchors.keys()
 134
 135     def _extractAnchors(self, f):
 136         extractor = AnchorIndex.AnchorExtractor()
 137         extractor.feed(file(f).read())
 138         extractor.close()
 139         for anchor in extractor.anchors():
 140             self._addAnchor(anchor, f)
 141
 142 TAG_RE = re.compile("<[^>]*>")
 143 REF_RE = re.compile("&[^;]*;")
 144
 145 def stripHTML(s):
 146     s = TAG_RE.sub("",s)
 147     s = s.replace("&nbsp;"," ").replace("\n"," ")
 148     s = REF_RE.sub("?",s)
 149     return s.strip()
 150
 151 class LinkFixer:
 152
 153     def __init__(self, skipdirs=('.svn',)):
 154         self._index = AnchorIndex(skipdirs)
 155
 156     def init(self):
 157         self._index.build()
 158
 159     class LinkFilter(HTMLFilter):
 160
 161         def __init__(self, index, topdir, out):
 162             HTMLFilter.__init__(self, out)
 163             self._index = index
 164             self._topdir = topdir
 165
 166         def _check(self, tag, linkAttr, attrs):
 167             ix = target = None
 168             for i,(k,v) in enumerate(attrs):
 169                 if k == linkAttr:
 170                     ix, target = i, v
 171                     break
 172             if target:
 173                 fix = False
 174                 tdir = anchor = None
 175                 if '#' in target : target, anchor = target.rsplit('#',1)
 176                 if '/' in target : tdir, target = target.rsplit('/', 1)
 177                 newTarget = None
 178                 if anchor:
 179                     newTarget = self.anchorLookup(anchor)
 180                 if newTarget is None:
 181                     newTarget = self.fileLookup(target)
 182                     if newTarget and anchor:
 183                         newTarget = '%s#%s' % (newTarget, anchor)
 184                 if newTarget:
 185                     attrs[ix] = (attrs[ix][0], '/'.join((self._topdir, newTarget)))
 186             self.emit_starttag(tag, attrs)
 187
 188         def anchorLookup(self,anchor):
 189             target = None
 190             while not target:
 191                 target = self._index[anchor]
 192                 if target:
 193                     target = '%s#%s' % (target, anchor)
 194                 elif anchor.startswith('g'):
 195                     anchor = anchor[1:]
 196                 else:
 197                     break
 198             return target
 199
 200         def fileLookup(self,target):
 201             return self._index[target]
 202
 203         def _s_A(self, attrs):
 204             self._check('a', 'href', attrs)
 205
 206         def _s_AREA(self, attrs):
 207             self._check('area', 'href', attrs)
 208
 209     def fix(self, path):
 210         data = codecs.open(path, "r", "utf-8").read()
 211         filt = LinkFixer.LinkFilter(self._index,
 212                                     ("../" * (len(os.path.split(path)[0].split("/"))))[:-1],
 213                                     codecs.open(path, "w", "utf-8") )
 214         filt.feed(data)
 215         filt.close()
 216
 217 (opts, args) = getopt.getopt(sys.argv[1:], "vs:")
 218 if len(args) != 0:
 219     sys.stderr.write("""Usage: fix-links.py [-s skip-dir]...
 220
 221 Check all links and try to find the correct target. If a target is
 222 found, the link is changed accordingly, otherwise the link is removed.
 223
 224 To find anchors, fix-links.py generates a complete index of all
 225 anchors defined in any HTML file in the current directory or some
 226 subdirectory. The directories named 'skiped-dir' (at any level) will
 227 not be scanned for '*.html' files.
 228 """)
 229     sys.exit(1)
 230
 231 skipdirs = [ val for opt, val in opts if opt == '-s' ]
 232 verbose = ( '-v', '' ) in opts
 233
 234 fixer = LinkFixer(skipdirs)
 235 fixer.init()
 236
 237 for dirname, subdirs, files in os.walk('.'):
 238     for d in skipdirs:
 239         if d in subdirs:
 240             subdirs.remove(d)
 241     for f in fnmatch.filter(files,'*.html'):
 242         path = os.path.normpath(os.path.join(dirname, f))
 243         print path
 244         fixer.fix(path)