3 # This tool will hack the doxygen generated documentation to fix link
4 # errors produced by doxygen.
6 # This works because most anchors doxygen generates are unique 32 char
7 # hash values. To speed up the operation, the tool will not check all
8 # the files itself but will let 'linklint' do the grunt
9 # work. fix-links.py reads the 'errorX.txt' and 'errorAX.txt' files
10 # generated by linklint. These files list links to missing files
11 # (errorX.txt) and links to missing anchors
12 # (errorAX.txt). fix-links.py works in the following way:
14 # - Build a complete index of all unique anchors found in any html
15 # file. The index will only include *unique* anchors. Anchors found
16 # multiple times are removed from the index
18 # - The index is extended to include all unique names of html files
20 # - Scan the linklint result and check the bad links against the
21 # index. If the file or anchor is found in the index, an accordingly
22 # corrected link is generated, otherwise the link is removed.
24 # One additional tweak is that fix-links.py will successively remove
25 # initial 'g' characters from anchors until the link is found in the
26 # index. Doxygen seems to create links with the wrong number of 'g'
27 # characters in front sometimes.
29 import sys,os.path,fnmatch, HTMLParser, getopt, re
31 class HTMLFilter(HTMLParser.HTMLParser):
33 def __init__(self, out=None):
34 HTMLParser.HTMLParser.__init__(self)
39 def startCollect(self):
50 def handle_starttag(self,tag,attrs):
51 m = getattr(self,'_s_'+tag.upper(),None)
55 self.emit_starttag(tag,attrs)
57 def handle_endtag(self,tag):
58 m = getattr(self,'_e_'+tag.upper(),None)
64 def handle_data(self,data):
70 def handle_charref(self,name):
71 self.handle_data('&#%s;' % name)
73 def handle_entityref(self,name):
74 self.handle_data('&%s;' % name)
76 def emit_starttag(self,tag,attrs):
77 self.handle_data('<%s%s>' % (tag, "".join([' %s="%s"' % attr for attr in attrs])))
79 def emit_endtag(self,tag):
80 self.handle_data('</%s>' % tag)
85 def __init__(self, skipdirs = ('.svn',)):
87 self._skipdirs = skipdirs
90 sys.stderr.write("Building anchor index ")
92 for dirname, subdirs, files in os.walk('.'):
93 for d in self._skipdirs:
96 for f in fnmatch.filter(files,'*.html'):
98 path = os.path.normpath(os.path.join(dirname, f))
99 self._addAnchor(f, path)
100 self._extractAnchors(path)
101 sys.stderr.write(" Done.\n")
103 for k in self._anchors.keys():
104 if not self._anchors[k]:
107 sys.stderr.write("%d unique anchors in %d files (%d duplicates)\n"
108 % (len(self._anchors), nf, dups))
110 def _addAnchor(self, anchor, path):
111 if self._anchors.has_key(anchor):
112 self._anchors[anchor] = None
114 self._anchors[anchor] = path
115 if len(self._anchors) % 100 == 0:
116 sys.stderr.write('.')
118 def __getitem__(self, key):
119 return self._anchors.get(key)
121 class AnchorExtractor(HTMLFilter):
124 HTMLFilter.__init__(self)
127 def _s_A(self,attrs):
129 if attrs.has_key('name'):
130 self._anchors[attrs['name']] = None
133 return self._anchors.keys()
135 def _extractAnchors(self, f):
136 extractor = AnchorIndex.AnchorExtractor()
137 extractor.feed(file(f).read())
139 for anchor in extractor.anchors():
140 self._addAnchor(anchor, f)
142 TAG_RE = re.compile("<[^>]*>")
143 REF_RE = re.compile("&[^;]*;")
147 s = s.replace(" "," ").replace("\n"," ")
148 s = REF_RE.sub("?",s)
153 def __init__(self, skipdirs=('.svn',)):
154 self._index = AnchorIndex(skipdirs)
163 class LinkFilter(HTMLFilter):
165 def __init__(self, index, key, topdir, out):
166 HTMLFilter.__init__(self, out)
169 self._topdir = topdir
174 def _s_A(self, attrs):
176 if self._key in dict(attrs).get('href',''):
178 ix = [ i for i, attr in enumerate(attrs) if attr[0] == 'href' ][0]
179 anchor = attrs[ix][1]
181 anchor = anchor.split('#')[1]
185 target = self._index[a]
187 target = '%s#%s' % (target, a)
188 elif a.startswith('g'):
193 anchor = os.path.split(anchor)[1]
194 target = self._index[anchor]
197 attrs[ix] = ('href', os.path.join(self._topdir,target))
199 self._removed[anchor] = {}
200 self._collectFor = anchor
203 self.emit_starttag('a',attrs)
206 if self.collecting():
207 self._removed[self._collectFor][stripHTML(self.endCollect())] = None
209 self.emit_endtag('a')
212 return (self._found, self._fixed, self._removed)
214 def fix(self, path, target):
216 data = file(path).read()
217 filt = LinkFixer.LinkFilter(self._index,
219 "../" * (len(os.path.split(path)[0].split("/"))),
223 found, fixed, removed = filt.stats()
226 for anchor, labels in removed.items():
227 for label in labels.keys():
228 self._removed.setdefault((anchor,label),{})[path] = None
231 return (self._files, self._found, self._fixed, self._removed)
233 (opts, args) = getopt.getopt(sys.argv[1:], "vs:")
235 sys.stderr.write("""Usage:
236 fix-links.py [-s skip-dir]... <errrorX.txt> <errorAX.txt>
238 Process the 'errorX.txt' and 'errorAX.txt' files as generated by
239 'linklint': Check all invalid links and try to find the correct
240 target. If a target is found, the link is changed accordingly,
241 otherwise the link is removed.
243 To find anchors, fix-links.py generates a complete index of all
244 anchors defined in any HTML file in the current directory or some
245 subdirectory. The directories named 'skiped-dir' (at any level) will
246 not be scanned for '*.html' files.
250 skipdirs = [ val for opt, val in opts if opt == '-s' ]
251 verbose = ( '-v', '' ) in opts
253 if not os.path.exists(args[0]) and not os.path.exists(args[1]):
254 # No bad links to nothing to do
257 fixer = LinkFixer(skipdirs)
262 if os.path.exists(args[0]):
263 for l in file(args[0]):
265 if l.startswith('/'):
266 target = '#' + os.path.split(l)[1]
267 elif l.startswith(' /') and not l.endswith('/'):
268 sys.stderr.write("%s\n" % l)
269 fixer.fix(l[5:], target)
271 if os.path.exists(args[1]):
272 for l in file(args[1]):
274 if l.startswith('/'):
275 target = l.split('#')[1]
276 elif l.startswith(' /') and not l.endswith('/'):
277 sys.stderr.write("%s\n" % l)
278 fixer.fix(l[5:], target)
280 total, found, fixed, removed = fixer.stats()
283 sys.stderr.write("\nRemoved links:\n")
284 for (anchor, label), files in removed.items():
285 sys.stderr.write("%-36.36s %-48.48s %s\n"
288 " ".join(files.keys())) )
291 Files processed : %5d
292 Links processed : %5d
295 """ % (total, found, fixed, found-fixed))