X-Git-Url: http://g0dil.de/git?a=blobdiff_plain;f=doclib%2Ffix-links.py;h=fb7450eb4d564cb90ae0d62fe5935e365e513047;hb=61b2e2ea5cb50df90931acf3fcd840493ba762a9;hp=d0df7f6a16ef1f3e7acd1ccabdccae461cc40f84;hpb=e84dd6c52a07fc9e283cbd72c7616f3523920387;p=senf.git

diff --git a/doclib/fix-links.py b/doclib/fix-links.py
index d0df7f6..fb7450e 100644
--- a/doclib/fix-links.py
+++ b/doclib/fix-links.py
@@ -1,6 +1,32 @@
 #!/usr/bin/python
-
-import sys,os.path,fnmatch, HTMLParser, getopt
+#
+# This tool will hack the doxygen generated documentation to fix link
+# errors produced by doxygen.
+#
+# This works because most anchors doxygen generates are unique 32-char
+# hash values. To speed up the operation, the tool will not check all
+# the files itself but will let 'linklint' do the grunt
+# work. fix-links.py reads the 'errorX.txt' and 'errorAX.txt' files
+# generated by linklint. These files list links to missing files
+# (errorX.txt) and links to missing anchors
+# (errorAX.txt). fix-links.py works in the following way:
+#
+# - Build a complete index of all unique anchors found in any html
+#   file. The index will only include *unique* anchors. Anchors found
+#   multiple times are removed from the index.
+#
+# - The index is extended to include all unique names of html files.
+#
+# - Scan the linklint results and check the bad links against the
+#   index. If the file or anchor is found in the index, an accordingly
+#   corrected link is generated; otherwise the link is removed.
+#
+# One additional tweak is that fix-links.py will successively remove
+# initial 'g' characters from anchors until the link is found in the
+# index. Doxygen sometimes seems to create links with the wrong number
+# of leading 'g' characters.
+
+import sys,os.path,fnmatch, HTMLParser, getopt, re, codecs
 
 class HTMLFilter(HTMLParser.HTMLParser):
 
@@ -42,10 +68,10 @@ class HTMLFilter(HTMLParser.HTMLParser):
         self._out.write(data)
 
     def handle_charref(self,name):
-        self.handle_data(name)
+        self.handle_data('&#%s;' % name)
 
     def handle_entityref(self,name):
-        self.handle_data(name)
+        self.handle_data('&%s;' % name)
 
     def emit_starttag(self,tag,attrs):
         self.handle_data('<%s%s>' % (tag, "".join([' %s="%s"' % attr for attr in attrs])))
@@ -113,7 +139,15 @@ class AnchorIndex:
             for anchor in extractor.anchors():
                 self._addAnchor(anchor, f)
 
+TAG_RE = re.compile("<[^>]*>")
+REF_RE = re.compile("&[^;]*;")
+
+def stripHTML(s):
+    s = TAG_RE.sub("",s)
+    s = s.replace("&nbsp;"," ").replace("\n"," ")
+    s = REF_RE.sub("?",s)
+    return s.strip()
 
 class LinkFixer:
 
     def __init__(self, skipdirs=('.svn',)):
@@ -124,6 +158,7 @@ class LinkFixer:
         self._files = 0
         self._found = 0
         self._fixed = 0
+        self._removed = {}
 
     class LinkFilter(HTMLFilter):
 
@@ -132,60 +167,72 @@ class LinkFixer:
             self._index = index
             self._key = key
             self._topdir = topdir
-            self._skip_a = False
             self._found = 0
             self._fixed = 0
+            self._removed = {}
 
         def _s_A(self, attrs):
             self._skip_a = False
             if self._key in dict(attrs).get('href',''):
                 self._found += 1
                 ix = [ i for i, attr in enumerate(attrs) if attr[0] == 'href' ][0]
-                target = attrs[ix][1]
-                if '#' in target:
-                    anchor = target.split('#')[1]
-                    target = self._index[anchor]
-                    if target:
-                        target = '%s#%s' % (target, anchor)
+                anchor = attrs[ix][1]
+                if '#' in anchor:
+                    anchor = anchor.split('#')[1]
+                    a = anchor
+                    target = None
+                    while not target:
+                        target = self._index[a]
+                        if target:
+                            target = '%s#%s' % (target, a)
+                        elif a.startswith('g'):
+                            a = a[1:]
+                        else:
+                            break
                 else:
-                    target = self._index[os.path.split(target)[1]]
+                    anchor = os.path.split(anchor)[1]
+                    target = self._index[anchor]
                 if target:
                     self._fixed += 1
                     attrs[ix] = ('href', os.path.join(self._topdir,target))
                 else:
-                    self._skip_a = True
+                    self._removed[anchor] = {}
+                    self._collectFor = anchor
+                    self.startCollect()
                     return
             self.emit_starttag('a',attrs)
 
         def _e_A(self):
-            if self._skip_a:
-                self._skip_a = False
+            if self.collecting():
+                self._removed[self._collectFor][stripHTML(self.endCollect())] = None
             else:
                 self.emit_endtag('a')
 
         def stats(self):
-            return (self._found, self._fixed)
+            return (self._found, self._fixed, self._removed)
 
     def fix(self, path, target):
         self._files += 1
-        data = file(path).read()
+        data = codecs.open(path, "r", "utf-8").read()
         filt = LinkFixer.LinkFilter(self._index,
                                     target,
                                     "../" * (len(os.path.split(path)[0].split("/"))),
-                                    file(path,"w"))
+                                    codecs.open(path, "w", "utf-8") )
         filt.feed(data)
         filt.close()
-        self._found += filt.stats()[0]
-        self._fixed += filt.stats()[1]
+        found, fixed, removed = filt.stats()
+        self._found += found
+        self._fixed += fixed
+        for anchor, labels in removed.items():
+            for label in labels.keys():
+                self._removed.setdefault((anchor,label),{})[path] = None
 
     def stats(self):
-        return (self._files, self._found, self._fixed)
+        return (self._files, self._found, self._fixed, self._removed)
 
-
-(opts, args) = getopt.getopt(sys.argv[1:], "s:")
+(opts, args) = getopt.getopt(sys.argv[1:], "vs:")
 
 if len(args) != 2:
-    sys.stderr.write("""Usage:
-    fix-links.py [-s skip-dir]...
+    sys.stderr.write("""Usage: fix-links.py [-s skip-dir]...
 
 Process the 'errorX.txt' and 'errorAX.txt' files as generated by
 'linklint': Check all invalid links and try to find the correct
@@ -200,32 +247,48 @@ not be scanned for '*.html' files.
     sys.exit(1)
 
 skipdirs = [ val for opt, val in opts if opt == '-s' ]
+verbose = ( '-v', '' ) in opts
+
+if not os.path.exists(args[0]) and not os.path.exists(args[1]):
+    # No bad links, so nothing to do
+    sys.exit(0)
 
 fixer = LinkFixer(skipdirs)
 fixer.init()
 
 target = None
-for l in file(args[0]):
-    l = l.rstrip()
-    if l.startswith('/'):
-        target = '#' + os.path.split(l)[1]
-    elif l.startswith(' /') and not l.endswith('/'):
-        sys.stderr.write("%s\n" % l)
-        fixer.fix(l[5:], target)
-
-for l in file(args[1]):
-    l = l.rstrip()
-    if l.startswith('/'):
-        target = l.split('#')[1]
-    elif l.startswith(' /') and not l.endswith('/'):
-        sys.stderr.write("%s\n" % l)
-        fixer.fix(l[5:], target)
-
-files, found, fixed = fixer.stats()
+
+if os.path.exists(args[0]):
+    for l in file(args[0]):
+        l = l.rstrip()
+        if l.startswith('/'):
+            target = '#' + os.path.split(l)[1]
+        elif l.startswith(' /') and not l.endswith('/'):
+            sys.stderr.write("%s\n" % l)
+            fixer.fix(l[5:], target)
+
+if os.path.exists(args[1]):
+    for l in file(args[1]):
+        l = l.rstrip()
+        if l.startswith('/'):
+            target = l.split('#')[1]
+        elif l.startswith(' /') and not l.endswith('/'):
+            sys.stderr.write("%s\n" % l)
+            fixer.fix(l[5:], target)
+
+total, found, fixed, removed = fixer.stats()
+
+if verbose:
+    sys.stderr.write("\nRemoved links:\n")
+    for (anchor, label), files in removed.items():
+        sys.stderr.write("%-36.36s %-48.48s %s\n"
+                         % ( anchor,
+                             "(%s)" % label[:46],
+                             " ".join(files.keys())) )
 
 sys.stderr.write("""
 Files processed : %5d
 Links processed : %5d
 Links fixed     : %5d
 Links removed   : %5d
-""" % (files, found, fixed, found-fixed))
+""" % (total, found, fixed, found-fixed))
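For reference, the 'g'-stripping retry described in the header comment (and
implemented in the "while not target" loop above) can be illustrated in
isolation. This is a minimal sketch, not part of the commit: 'index' is a
plain dict standing in for the script's AnchorIndex, and the anchor and file
names are hypothetical examples.

    # Resolve an anchor against the index, dropping leading 'g' characters
    # one at a time (doxygen sometimes emits too many). Returns a corrected
    # 'file#anchor' target, or None if the link has to be removed.
    def resolve(index, anchor):
        a = anchor
        while True:
            target = index.get(a)
            if target:
                return '%s#%s' % (target, a)
            if a.startswith('g'):
                a = a[1:]          # retry without one leading 'g'
            else:
                return None        # no candidate left; remove the link

    # Hypothetical example: doxygen emitted one 'g' too many in the href.
    index = {'ga1b2c3d4': 'group__packet.html'}
    print(resolve(index, 'gga1b2c3d4'))
    # -> group__packet.html#ga1b2c3d4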