doclib/fix-links.py

   1 #!/usr/bin/python
   2 #
   3 # This tool will hack the doxygen generated documentation to fix link
   4 # errors produced by doxygen.
   5 #
   6 # This works because most anchors doxygen generates are unique 32 char
   7 # hash values. To speed up the operation, the tool will not check all
   8 # the files itself but will let 'linklint' do the grunt
   9 # work. fix-links.py reads the 'errorX.txt' and 'errorAX.txt' files
  10 # generated by linklint. These files list links to missing files
  11 # (errorX.html) and links to missing anchors
  12 # (errorAX.html). fix-links.py works in the following way:
  13 #
  14 # - Build a complete index of all unique anchors found in any html
  15 #   file. The index will only include *unique* anchors. Anchors found
  16 #   multiple times are removed from the index
  17 #
  18 # - The index is extended to include all unique names of html files
  19 #
  20 # - Scan the linklint result and check the bad links against the
  21 #   index. If the file or anchor is found in the index, an accordingly
  22 #   corrected link is generated; otherwise the link is removed.
  23 #
  24 # One additional tweak is that fix-links.py will successively remove
  25 # initial 'g' characters from anchors until the link is found in the
  26 # index. Doxygen seems to create links with the wrong number of 'g'
  27 # characters in front sometimes.
  28
  29 import sys,os.path,fnmatch, HTMLParser, getopt, re, codecs
  30
  31 class HTMLFilter(HTMLParser.HTMLParser):
  32
  33     def __init__(self, out=None):
  34         HTMLParser.HTMLParser.__init__(self)
  35         self._out = out
  36         self._collect = False
  37         self._data = ""
  38
  39     def startCollect(self):
  40         self._collect = True
  41         self._data = ""
  42
  43     def endCollect(self):
  44         self._collect = False
  45         return self._data
  46
  47     def collecting(self):
  48         return self._collect
  49
  50     def handle_starttag(self,tag,attrs):
  51         m = getattr(self,'_s_'+tag.upper(),None)
  52         if m:
  53             m(attrs)
  54         else:
  55             self.emit_starttag(tag,attrs)
  56
  57     def handle_endtag(self,tag):
  58         m = getattr(self,'_e_'+tag.upper(),None)
  59         if m:
  60             m()
  61         else:
  62             self.emit_endtag(tag)
  63
  64     def handle_data(self,data):
  65         if self._collect:
  66             self._data += data
  67         if self._out:
  68             self._out.write(data)
  69
  70     def handle_charref(self,name):
  71         self.handle_data('&#%s;' % name)
  72
  73     def handle_entityref(self,name):
  74         self.handle_data('&%s;' % name)
  75
  76     def emit_starttag(self,tag,attrs):
  77         self.handle_data('<%s%s>' % (tag, "".join([' %s="%s"' % attr for attr in attrs])))
  78
  79     def emit_endtag(self,tag):
  80         self.handle_data('</%s>' % tag)
  81
  82
  83 class AnchorIndex:
  84
  85     def __init__(self, skipdirs = ('.svn',)):
  86         self._anchors = {}
  87         self._skipdirs = skipdirs
  88
  89     def build(self):
  90         sys.stderr.write("Building anchor index ")
  91         nf = 0
  92         for dirname, subdirs, files in os.walk('.'):
  93             for d in self._skipdirs:
  94                 if d in subdirs:
  95                     subdirs.remove(d)
  96             for f in fnmatch.filter(files,'*.html'):
  97                 nf += 1
  98                 path = os.path.normpath(os.path.join(dirname, f))
  99                 self._addAnchor(f, path)
 100                 self._extractAnchors(path)
 101         sys.stderr.write(" Done.\n")
 102         dups = 0
 103         for k in self._anchors.keys():
 104             if not self._anchors[k]:
 105                 dups += 1
 106                 del self._anchors[k]
 107         sys.stderr.write("%d unique anchors in %d files (%d duplicates)\n"
 108                          % (len(self._anchors), nf, dups))
 109
 110     def _addAnchor(self, anchor, path):
 111         if self._anchors.has_key(anchor):
 112             self._anchors[anchor] = None
 113         else:
 114             self._anchors[anchor] = path
 115             if len(self._anchors) % 100 == 0:
 116                 sys.stderr.write('.')
 117
 118     def __getitem__(self, key):
 119         return self._anchors.get(key)
 120
 121     class AnchorExtractor(HTMLFilter):
 122
 123         def __init__(self):
 124             HTMLFilter.__init__(self)
 125             self._anchors = {}
 126
 127         def _s_A(self,attrs):
 128             attrs = dict(attrs)
 129             if attrs.has_key('name'):
 130                 self._anchors[attrs['name']] = None
 131
 132         def anchors(self):
 133             return self._anchors.keys()
 134
 135     def _extractAnchors(self, f):
 136         extractor = AnchorIndex.AnchorExtractor()
 137         extractor.feed(file(f).read())
 138         extractor.close()
 139         for anchor in extractor.anchors():
 140             self._addAnchor(anchor, f)
 141
 142 TAG_RE = re.compile("<[^>]*>")
 143 REF_RE = re.compile("&[^;]*;")
 144
 145 def stripHTML(s):
 146     s = TAG_RE.sub("",s)
 147     s = s.replace("&nbsp;"," ").replace("\n"," ")
 148     s = REF_RE.sub("?",s)
 149     return s.strip()
 150
 151 class LinkFixer:
 152
 153     def __init__(self, skipdirs=('.svn',)):
 154         self._index = AnchorIndex(skipdirs)
 155
 156     def init(self):
 157         self._index.build()
 158         self._files = 0
 159         self._found = 0
 160         self._fixed = 0
 161         self._removed = {}
 162
 163     class LinkFilter(HTMLFilter):
 164
 165         def __init__(self, index, key, topdir, out):
 166             HTMLFilter.__init__(self, out)
 167             self._index = index
 168             self._key = key
 169             self._topdir = topdir
 170             self._found = 0
 171             self._fixed = 0
 172             self._removed = {}
 173
 174         def _s_A(self, attrs):
 175             self._skip_a = False
 176             if self._key in dict(attrs).get('href',''):
 177                 self._found += 1
 178                 ix = [ i for i, attr in enumerate(attrs) if attr[0] == 'href' ][0]
 179                 anchor = attrs[ix][1]
 180                 if '#' in anchor:
 181                     anchor = anchor.split('#')[1]
 182                     a = anchor
 183                     target = None
 184                     while not target:
 185                         target = self._index[a]
 186                         if target:
 187                             target = '%s#%s' % (target, a)
 188                         elif a.startswith('g'):
 189                             a = a[1:]
 190                         else:
 191                             break
 192                 else:
 193                     anchor = os.path.split(anchor)[1]
 194                     target = self._index[anchor]
 195                 if target:
 196                     self._fixed += 1
 197                     attrs[ix] = ('href', os.path.join(self._topdir,target))
 198                 else:
 199                     self._removed[anchor] = {}
 200                     self._collectFor = anchor
 201                     self.startCollect()
 202                     return
 203             self.emit_starttag('a',attrs)
 204
 205         def _e_A(self):
 206             if self.collecting():
 207                 self._removed[self._collectFor][stripHTML(self.endCollect())] = None
 208             else:
 209                 self.emit_endtag('a')
 210
 211         def stats(self):
 212             return (self._found, self._fixed, self._removed)
 213
 214     def fix(self, path, target):
 215         self._files += 1
 216         data = codecs.open(path, "r", "utf-8").read()
 217         filt = LinkFixer.LinkFilter(self._index,
 218                                     target,
 219                                     "../" * (len(os.path.split(path)[0].split("/"))),
 220                                     codecs.open(path, "w", "utf-8") )
 221         filt.feed(data)
 222         filt.close()
 223         found, fixed, removed = filt.stats()
 224         self._found += found
 225         self._fixed += fixed
 226         for anchor, labels in removed.items():
 227             for label in labels.keys():
 228                 self._removed.setdefault((anchor,label),{})[path] = None
 229
 230     def stats(self):
 231         return (self._files, self._found, self._fixed, self._removed)
 232
 233 (opts, args) = getopt.getopt(sys.argv[1:], "vs:")
 234 if len(args) != 2:
 235     sys.stderr.write("""Usage: fix-links.py [-s skip-dir]... <errrorX.txt> <errorAX.txt>
 236
 237 Process the 'errorX.txt' and 'errorAX.txt' files as generated by
 238 'linklint': Check all invalid links and try to find the correct
 239 target. If a target is found, the link is changed accordingly,
 240 otherwise the link is removed.
 241
 242 To find anchors, fix-links.py generates a complete index of all
 243 anchors defined in any HTML file in the current directory or some
 244 subdirectory. The directories named 'skiped-dir' (at any level) will
 245 not be scanned for '*.html' files.
 246 """)
 247     sys.exit(1)
 248
 249 skipdirs = [ val for opt, val in opts if opt == '-s' ]
 250 verbose = ( '-v', '' ) in opts
 251
 252 if not os.path.exists(args[0]) and not os.path.exists(args[1]):
 253     # No bad links to nothing to do
 254     sys.exit(0)
 255
 256 fixer = LinkFixer(skipdirs)
 257 fixer.init()
 258
 259 target = None
 260
 261 if os.path.exists(args[0]):
 262     for l in file(args[0]):
 263         l = l.rstrip()
 264         if l.startswith('/'):
 265             target = '#' + os.path.split(l)[1]
 266         elif l.startswith('    /') and not l.endswith('/'):
 267             sys.stderr.write("%s\n" % l)
 268             fixer.fix(l[5:], target)
 269
 270 if os.path.exists(args[1]):
 271     for l in file(args[1]):
 272         l = l.rstrip()
 273         if l.startswith('/'):
 274             target = l.split('#')[1]
 275         elif l.startswith('    /') and not l.endswith('/'):
 276             sys.stderr.write("%s\n" % l)
 277             fixer.fix(l[5:], target)
 278
 279 total, found, fixed, removed = fixer.stats()
 280
 281 if verbose:
 282     sys.stderr.write("\nRemoved links:\n")
 283     for (anchor, label), files in removed.items():
 284         sys.stderr.write("%-36.36s %-48.48s %s\n"
 285                          % ( anchor,
 286                              "(%s)" % label[:46],
 287                              " ".join(files.keys())) )
 288
 289 sys.stderr.write("""
 290 Files processed : %5d
 291 Links processed : %5d
 292 Links fixed     : %5d
 293 Links removed   : %5d
 294 """ % (total, found, fixed, found-fixed))