X-Git-Url: http://g0dil.de/git?a=blobdiff_plain;f=doclib%2Ffix-links.py;h=3958e3fa0f946b066b9bfbbf1ca5d50980caacde;hb=HEAD;hp=d0df7f6a16ef1f3e7acd1ccabdccae461cc40f84;hpb=e84dd6c52a07fc9e283cbd72c7616f3523920387;p=senf.git

diff --git a/doclib/fix-links.py b/doclib/fix-links.py
index d0df7f6..3958e3f 100644
--- a/doclib/fix-links.py
+++ b/doclib/fix-links.py
@@ -1,6 +1,32 @@
 #!/usr/bin/python
-
-import sys,os.path,fnmatch, HTMLParser, getopt
+#
+# This tool will hack the doxygen-generated documentation to fix link
+# errors produced by doxygen.
+#
+# This works because most anchors doxygen generates are unique 32 char
+# hash values. To speed up the operation, the tool will not check all
+# the files itself but will let 'linklint' do the grunt
+# work. fix-links.py reads the 'errorX.txt' and 'errorAX.txt' files
+# generated by linklint. These files list links to missing files
+# (errorX.txt) and links to missing anchors
+# (errorAX.txt). fix-links.py works in the following way:
+#
+# - Build a complete index of all unique anchors found in any html
+#   file. The index will only include *unique* anchors. Anchors found
+#   multiple times are removed from the index.
+#
+# - The index is extended to include all unique names of html files.
+#
+# - Scan the linklint results and check the bad links against the
+#   index. If the file or anchor is found in the index, an accordingly
+#   corrected link is generated, otherwise the link is removed.
+#
+# One additional tweak is that fix-links.py will successively remove
+# initial 'g' characters from anchors until the link is found in the
+# index. Doxygen seems to create links with the wrong number of 'g'
+# characters in front sometimes.
+
+import sys,os.path,fnmatch, HTMLParser, getopt, re, codecs
 
 class HTMLFilter(HTMLParser.HTMLParser):
 
@@ -42,10 +68,10 @@ class HTMLFilter(HTMLParser.HTMLParser):
         self._out.write(data)
 
     def handle_charref(self,name):
-        self.handle_data(name)
+        self.handle_data('&#%s;' % name)
 
     def handle_entityref(self,name):
-        self.handle_data(name)
+        self.handle_data('&%s;' % name)
 
     def emit_starttag(self,tag,attrs):
         self.handle_data('<%s%s>' % (tag, "".join([' %s="%s"' % attr for attr in attrs])))
@@ -113,6 +139,14 @@ class AnchorIndex:
                 for anchor in extractor.anchors():
                     self._addAnchor(anchor, f)
 
+TAG_RE = re.compile("<[^>]*>")
+REF_RE = re.compile("&[^;]*;")
+
+def stripHTML(s):
+    s = TAG_RE.sub("",s)
+    s = s.replace("&nbsp;"," ").replace("\n"," ")
+    s = REF_RE.sub("?",s)
+    return s.strip()
 
 class LinkFixer:
 
@@ -121,76 +155,71 @@ class LinkFixer:
 
     def init(self):
        self._index.build()
-        self._files = 0
-        self._found = 0
-        self._fixed = 0
 
     class LinkFilter(HTMLFilter):
 
-        def __init__(self, index, key, topdir, out):
+        def __init__(self, index, topdir, out):
             HTMLFilter.__init__(self, out)
             self._index = index
-            self._key = key
             self._topdir = topdir
-            self._skip_a = False
-            self._found = 0
-            self._fixed = 0
 
-        def _s_A(self, attrs):
-            self._skip_a = False
-            if self._key in dict(attrs).get('href',''):
-                self._found += 1
-                ix = [ i for i, attr in enumerate(attrs) if attr[0] == 'href' ][0]
-                target = attrs[ix][1]
-                if '#' in target:
-                    anchor = target.split('#')[1]
-                    target = self._index[anchor]
-                    if target:
-                        target = '%s#%s' % (target, anchor)
-                else:
-                    target = self._index[os.path.split(target)[1]]
+        def _check(self, tag, linkAttr, attrs):
+            ix = target = None
+            for i,(k,v) in enumerate(attrs):
+                if k == linkAttr:
+                    ix, target = i, v
+                    break
+            if target:
+                fix = False
+                tdir = anchor = None
+                if '#' in target : target, anchor = target.rsplit('#',1)
+                if '/' in target : tdir, target = target.rsplit('/', 1)
+                newTarget = None
+                if anchor:
+                    newTarget = self.anchorLookup(anchor)
+                if newTarget is None:
+                    newTarget = self.fileLookup(target)
+                    if newTarget and anchor:
+                        newTarget = '%s#%s' % (newTarget, anchor)
+                if newTarget:
+                    attrs[ix] = (attrs[ix][0], '/'.join((self._topdir, newTarget)))
+            self.emit_starttag(tag, attrs)
+
+        def anchorLookup(self,anchor):
+            target = None
+            while not target:
+                target = self._index[anchor]
                 if target:
-                    self._fixed += 1
-                    attrs[ix] = ('href', os.path.join(self._topdir,target))
+                    target = '%s#%s' % (target, anchor)
+                elif anchor.startswith('g'):
+                    anchor = anchor[1:]
                 else:
-                    self._skip_a = True
-                    return
-            self.emit_starttag('a',attrs)
-
-        def _e_A(self):
-            if self._skip_a:
-                self._skip_a = False
-            else:
-                self.emit_endtag('a')
-
-        def stats(self):
-            return (self._found, self._fixed)
-
-    def fix(self, path, target):
-        self._files += 1
-        data = file(path).read()
+                    break
+            return target
+
+        def fileLookup(self,target):
+            return self._index[target]
+
+        def _s_A(self, attrs):
+            self._check('a', 'href', attrs)
+
+        def _s_AREA(self, attrs):
+            self._check('area', 'href', attrs)
+
+    def fix(self, path):
+        data = codecs.open(path, "r", "utf-8").read()
         filt = LinkFixer.LinkFilter(self._index,
-                                    target,
-                                    "../" * (len(os.path.split(path)[0].split("/"))),
-                                    file(path,"w"))
+                                    ("../" * (len(os.path.split(path)[0].split("/"))))[:-1],
+                                    codecs.open(path, "w", "utf-8") )
         filt.feed(data)
         filt.close()
-        self._found += filt.stats()[0]
-        self._fixed += filt.stats()[1]
-
-    def stats(self):
-        return (self._files, self._found, self._fixed)
 
-(opts, args) = getopt.getopt(sys.argv[1:], "s:")
-if len(args) != 2:
-    sys.stderr.write("""Usage:
-    fix-links.py [-s skip-dir]...
+(opts, args) = getopt.getopt(sys.argv[1:], "vs:")
+if len(args) != 0:
+    sys.stderr.write("""Usage: fix-links.py [-s skip-dir]...
 
-Process the 'errorX.txt' and 'errorAX.txt' files as generated by
-'linklint': Check all invalid links and try to find the correct
-target. If a target is found, the link is changed accordingly,
-otherwise the link is removed.
+Check all links and try to find the correct target. If a target is
+found, the link is changed accordingly, otherwise the link is removed.
 
 To find anchors, fix-links.py generates a complete index of all
 anchors defined in any HTML file in the current directory or some
@@ -200,32 +229,16 @@ not be scanned for '*.html' files.
     sys.exit(1)
 
 skipdirs = [ val for opt, val in opts if opt == '-s' ]
+verbose = ( '-v', '' ) in opts
 
 fixer = LinkFixer(skipdirs)
 fixer.init()
 
-target = None
-for l in file(args[0]):
-    l = l.rstrip()
-    if l.startswith('/'):
-        target = '#' + os.path.split(l)[1]
-    elif l.startswith(' /') and not l.endswith('/'):
-        sys.stderr.write("%s\n" % l)
-        fixer.fix(l[5:], target)
-
-for l in file(args[1]):
-    l = l.rstrip()
-    if l.startswith('/'):
-        target = l.split('#')[1]
-    elif l.startswith(' /') and not l.endswith('/'):
-        sys.stderr.write("%s\n" % l)
-        fixer.fix(l[5:], target)
-
-files, found, fixed = fixer.stats()
-
-sys.stderr.write("""
-Files processed : %5d
-Links processed : %5d
-Links fixed     : %5d
-Links removed   : %5d
-""" % (files, found, fixed, found-fixed))
+for dirname, subdirs, files in os.walk('.'):
+    for d in skipdirs:
+        if d in subdirs:
+            subdirs.remove(d)
+    for f in fnmatch.filter(files,'*.html'):
+        path = os.path.normpath(os.path.join(dirname, f))
+        print path
+        fixer.fix(path)
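
Note on the anchor lookup: the core of this change is the retry loop in
anchorLookup(), which implements the 'g'-stripping heuristic described
in the new header comment. A minimal, self-contained sketch of that
heuristic follows; the plain 'index' dict is a stand-in for the patch's
AnchorIndex class (which likewise yields None for unknown or ambiguous
anchors), and the sample values are hypothetical, not taken from real
doxygen output:

    # Sketch of the lookup heuristic, assuming 'index' maps unique
    # anchor (and file) names to the HTML file defining them.
    def anchor_lookup(index, anchor):
        while True:
            target = index.get(anchor)
            if target:
                # Found: rebuild the full 'file#anchor' reference.
                return '%s#%s' % (target, anchor)
            if anchor.startswith('g'):
                # Doxygen sometimes emits surplus leading 'g'
                # characters; strip one and retry against the index.
                anchor = anchor[1:]
            else:
                return None  # unresolvable; the caller drops the link

    index = {'ga0b1c2': 'group__packet.html'}  # hypothetical entry
    assert anchor_lookup(index, 'gga0b1c2') == 'group__packet.html#ga0b1c2'
    assert anchor_lookup(index, 'deadbeef') is None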