#!/usr/bin/python
-
-import sys,os.path,fnmatch, HTMLParser, getopt
+#
+# This tool will hack the doxygen generated documentation to fix link
+# errors produced by doxygen.
+#
+# This works because most anchors doxygen generates are unique 32 char
+# hash values. To speed up the operation, the tool will not check all
+# the files itself but will let 'linklint' do the grunt
+# work. fix-links.py reads the 'errorX.txt' and 'errorAX.txt' files
+# generated by linklint. These files list links to missing files
+# (errorX.txt) and links to missing anchors
+# (errorAX.txt). fix-links.py works in the following way:
+#
+# - Build a complete index of all unique anchors found in any html
+# file. The index will only include *unique* anchors. Anchors found
+# multiple times are removed from the index
+#
+# - The index is extended to include all unique names of html files
+#
+# - Scan the linklint result and check the bad links against the
+#   index. If the file or anchor is found in the index, an accordingly
+#   corrected link is generated otherwise the link is removed.
+#
+# One additional tweak is, that fix-links.py will successively remove
+# initial 'g' characters from anchors until the link is found in the
+# index. Doxygen seems to create links with the wrong number of 'g'
+# characters in front sometimes.
+
+import sys,os.path,fnmatch, HTMLParser, getopt, re, codecs
class HTMLFilter(HTMLParser.HTMLParser):
self._out.write(data)
def handle_charref(self,name):
- self.handle_data(name)
+ self.handle_data('&#%s;' % name)
def handle_entityref(self,name):
- self.handle_data(name)
+ self.handle_data('&%s;' % name)
def emit_starttag(self,tag,attrs):
self.handle_data('<%s%s>' % (tag, "".join([' %s="%s"' % attr for attr in attrs])))
for anchor in extractor.anchors():
self._addAnchor(anchor, f)
+TAG_RE = re.compile("<[^>]*>")
+REF_RE = re.compile("&[^;]*;")
+
+def stripHTML(s):
+ s = TAG_RE.sub("",s)
+ s = s.replace(" "," ").replace("\n"," ")
+ s = REF_RE.sub("?",s)
+ return s.strip()
class LinkFixer:
def init(self):
self._index.build()
- self._files = 0
- self._found = 0
- self._fixed = 0
class LinkFilter(HTMLFilter):
- def __init__(self, index, key, topdir, out):
+ def __init__(self, index, topdir, out):
HTMLFilter.__init__(self, out)
self._index = index
- self._key = key
self._topdir = topdir
- self._skip_a = False
- self._found = 0
- self._fixed = 0
- def _s_A(self, attrs):
- self._skip_a = False
- if self._key in dict(attrs).get('href',''):
- self._found += 1
- ix = [ i for i, attr in enumerate(attrs) if attr[0] == 'href' ][0]
- target = attrs[ix][1]
- if '#' in target:
- anchor = target.split('#')[1]
- target = self._index[anchor]
- if target:
- target = '%s#%s' % (target, anchor)
- else:
- target = self._index[os.path.split(target)[1]]
+ def _check(self, tag, linkAttr, attrs):
+ ix = target = None
+ for i,(k,v) in enumerate(attrs):
+ if k == linkAttr:
+ ix, target = i, v
+ break
+ if target:
+ fix = False
+ tdir = anchor = None
+ if '#' in target : target, anchor = target.rsplit('#',1)
+ if '/' in target : tdir, target = target.rsplit('/', 1)
+ newTarget = None
+ if anchor:
+ newTarget = self.anchorLookup(anchor)
+ if newTarget is None:
+ newTarget = self.fileLookup(target)
+ if newTarget and anchor:
+ newTarget = '%s#%s' % (newTarget, anchor)
+ if newTarget:
+ attrs[ix] = (attrs[ix][0], '/'.join((self._topdir, newTarget)))
+ self.emit_starttag(tag, attrs)
+
+ def anchorLookup(self,anchor):
+ target = None
+ while not target:
+ target = self._index[anchor]
if target:
- self._fixed += 1
- attrs[ix] = ('href', os.path.join(self._topdir,target))
+ target = '%s#%s' % (target, anchor)
+ elif anchor.startswith('g'):
+ anchor = anchor[1:]
else:
- self._skip_a = True
- return
- self.emit_starttag('a',attrs)
-
- def _e_A(self):
- if self._skip_a:
- self._skip_a = False
- else:
- self.emit_endtag('a')
-
- def stats(self):
- return (self._found, self._fixed)
-
- def fix(self, path, target):
- self._files += 1
- data = file(path).read()
+ break
+ return target
+
+ def fileLookup(self,target):
+ return self._index[target]
+
+ def _s_A(self, attrs):
+ self._check('a', 'href', attrs)
+
+ def _s_AREA(self, attrs):
+ self._check('area', 'href', attrs)
+
+ def fix(self, path):
+ data = codecs.open(path, "r", "utf-8").read()
filt = LinkFixer.LinkFilter(self._index,
- target,
- "../" * (len(os.path.split(path)[0].split("/"))),
- file(path,"w"))
+ ("../" * (len(os.path.split(path)[0].split("/"))))[:-1],
+ codecs.open(path, "w", "utf-8") )
filt.feed(data)
filt.close()
- self._found += filt.stats()[0]
- self._fixed += filt.stats()[1]
-
- def stats(self):
- return (self._files, self._found, self._fixed)
-
-(opts, args) = getopt.getopt(sys.argv[1:], "s:")
-if len(args) != 2:
- sys.stderr.write("""Usage:
- fix-links.py [-s skip-dir]... <errrorX.txt> <errorAX.txt>
+(opts, args) = getopt.getopt(sys.argv[1:], "vs:")
+if len(args) != 0:
+ sys.stderr.write("""Usage: fix-links.py [-s skip-dir]...
-Process the 'errorX.txt' and 'errorAX.txt' files as generated by
-'linklint': Check all invalid links and try to find the correct
-target. If a target is found, the link is changed accordingly,
-otherwise the link is removed.
+Check all links and try to find the correct target. If a target is
+found, the link is changed accordingly, otherwise the link is removed.
To find anchors, fix-links.py generates a complete index of all
anchors defined in any HTML file in the current directory or some
sys.exit(1)
skipdirs = [ val for opt, val in opts if opt == '-s' ]
+verbose = ( '-v', '' ) in opts
fixer = LinkFixer(skipdirs)
fixer.init()
-target = None
-for l in file(args[0]):
- l = l.rstrip()
- if l.startswith('/'):
- target = '#' + os.path.split(l)[1]
- elif l.startswith(' /') and not l.endswith('/'):
- sys.stderr.write("%s\n" % l)
- fixer.fix(l[5:], target)
-
-for l in file(args[1]):
- l = l.rstrip()
- if l.startswith('/'):
- target = l.split('#')[1]
- elif l.startswith(' /') and not l.endswith('/'):
- sys.stderr.write("%s\n" % l)
- fixer.fix(l[5:], target)
-
-files, found, fixed = fixer.stats()
-
-sys.stderr.write("""
-Files processed : %5d
-Links processed : %5d
-Links fixed : %5d
-Links removed : %5d
-""" % (files, found, fixed, found-fixed))
+for dirname, subdirs, files in os.walk('.'):
+ for d in skipdirs:
+ if d in subdirs:
+ subdirs.remove(d)
+ for f in fnmatch.filter(files,'*.html'):
+ path = os.path.normpath(os.path.join(dirname, f))
+ print path
+ fixer.fix(path)