3 import sys,os.path,fnmatch, HTMLParser, getopt
# Base filter: parses HTML and re-emits it, letting subclasses intercept
# individual tags via optional '_s_<TAG>' (start) and '_e_<TAG>' (end) hooks.
# NOTE(review): this extract is sampled and whitespace-mangled -- original
# file line numbers are embedded in the text and many lines are missing, so
# the visible code is kept byte-identical and only comments are added.
# NOTE(review): written for Python 2 (HTMLParser module, file(), has_key);
# porting to Python 3 would need html.parser et al.
5 class HTMLFilter(HTMLParser.HTMLParser):
7 def __init__(self, out=None):
# Chain to the base-class initializer (old-style Py2 super call); *out* is
# presumably the output stream for re-emitted HTML -- the lines storing it
# are not visible here, confirm against the full file.
8 HTMLParser.HTMLParser.__init__(self)
13 def startCollect(self):
# Body not visible in this extract; presumably switches handle_data into a
# collect-into-buffer mode -- TODO confirm against the full file.
24 def handle_starttag(self,tag,attrs):
# Dispatch to a subclass hook named '_s_<TAG>' (upper-cased tag) if one
# exists; otherwise fall through and re-emit the tag unchanged.
25 m = getattr(self,'_s_'+tag.upper(),None)
# (lines between 25 and 29 are missing -- presumably the call to m and the
# else branch selecting the default re-emission below)
29 self.emit_starttag(tag,attrs)
31 def handle_endtag(self,tag):
# Same dispatch pattern for end tags via '_e_<TAG>' hooks (call site for m
# is in the missing lines).
32 m = getattr(self,'_e_'+tag.upper(),None)
38 def handle_data(self,data):
# Body not visible; presumably writes *data* to the output stream or to an
# internal collect buffer.
44 def handle_charref(self,name):
# Character references are forwarded as plain data.
# NOTE(review): as visible here the '&#...;' wrapper is dropped and only
# the bare reference text is emitted -- confirm this is intentional.
45 self.handle_data(name)
47 def handle_entityref(self,name):
# Entity references likewise pass through as plain data (same caveat as
# handle_charref regarding the dropped '&...;' wrapper).
48 self.handle_data(name)
50 def emit_starttag(self,tag,attrs):
# Reconstruct the start tag literally from (name, value) attribute pairs.
# NOTE(review): attribute values are not HTML-escaped on re-emission.
51 self.handle_data('<%s%s>' % (tag, "".join([' %s="%s"' % attr for attr in attrs])))
53 def emit_endtag(self,tag):
# Reconstruct and re-emit a literal end tag.
54 self.handle_data('</%s>' % tag)
59 def __init__(self, skipdirs = ('.svn',)):
# Build an index mapping anchor name -> defining file path by scanning
# every '*.html' file under the current directory, skipping *skipdirs*.
# NOTE(review): sampled extract -- the lines initializing self._anchors,
# pruning skip-dirs, and counting files (nf) / duplicates (dups) are
# missing from this view.
61 self._skipdirs = skipdirs
64 sys.stderr.write("Building anchor index ")
66 for dirname, subdirs, files in os.walk('.'):
# Presumably removes each skip-dir from *subdirs* in place so os.walk
# does not descend into it -- the removal line is not visible.
67 for d in self._skipdirs:
70 for f in fnmatch.filter(files,'*.html'):
72 path = os.path.normpath(os.path.join(dirname, f))
# The bare file name is itself registered as an "anchor" so links of
# the form 'page.html' can be resolved, then the file's real anchors
# are parsed and registered too.
73 self._addAnchor(f, path)
74 self._extractAnchors(path)
75 sys.stderr.write(" Done.\n")
# Entries holding None mark duplicate (ambiguous) anchors -- see
# _addAnchor; this loop presumably tallies them (counting lines missing).
77 for k in self._anchors.keys():
78 if not self._anchors[k]:
81 sys.stderr.write("%d unique anchors in %d files (%d duplicates)\n"
82 % (len(self._anchors), nf, dups))
84 def _addAnchor(self, anchor, path):
# Record *anchor* as defined in *path*. A repeated anchor is ambiguous, so
# its entry is overwritten with None; __getitem__ then reports it as
# unresolvable. (Line 87, presumably the 'else:', is missing here.)
# NOTE(review): dict.has_key() is Python 2 only.
85 if self._anchors.has_key(anchor):
86 self._anchors[anchor] = None
88 self._anchors[anchor] = path
# Progress indicator: presumably one tick per 100 indexed anchors -- the
# write call under this condition is not visible.
89 if len(self._anchors) % 100 == 0:
92 def __getitem__(self, key):
93 return self._anchors.get(key)
95 class AnchorExtractor(HTMLFilter):
# HTMLFilter subclass that collects the name of every <a name="..."> seen
# while parsing a document (nested helper class; the __init__ def line and
# the initialization of the collecting dict are missing from this extract).
98 HTMLFilter.__init__(self)
101 def _s_A(self,attrs):
# Start-tag hook for <A>, invoked via HTMLFilter.handle_starttag dispatch.
# NOTE(review): has_key implies *attrs* is a dict at this point, while
# HTMLParser delivers a list of pairs -- a conversion line between 101 and
# 103 is presumably missing; confirm against the full file.
103 if attrs.has_key('name'):
# Collected as dict keys (value unused) to deduplicate anchor names.
104 self._anchors[attrs['name']] = None
# (the 'def anchors(self):' line is missing; this returns collected names)
107 return self._anchors.keys()
109 def _extractAnchors(self, f):
# Parse HTML file *f* with AnchorExtractor and register every anchor it
# defines under path *f* in the index.
110 extractor = AnchorIndex.AnchorExtractor()
# Python 2 file() builtin; slurps the whole document into memory. The file
# object is never closed explicitly (relies on CPython refcounting);
# an extractor.close() call is also not visible in this extract.
111 extractor.feed(file(f).read())
113 for anchor in extractor.anchors():
114 self._addAnchor(anchor, f)
119 def __init__(self, skipdirs=('.svn',)):
# Build the global anchor index up front. The counters this class's
# stats() later returns (self._files, self._found, self._fixed) are
# presumably initialized in the lines missing after 120 -- confirm.
120 self._index = AnchorIndex(skipdirs)
128 class LinkFilter(HTMLFilter):
# HTMLFilter subclass that rewrites broken <a href="..."> targets using
# the anchor index, counting links found and fixed (nested helper class).
130 def __init__(self, index, key, topdir, out):
# *index*: AnchorIndex for lookups; *key*: substring identifying the
# broken target; *topdir*: relative '../' prefix back to the scan root;
# *out*: stream receiving the filtered HTML.
131 HTMLFilter.__init__(self, out)
# (lines storing index/key and initializing the found/fixed counters are
# missing from this extract)
134 self._topdir = topdir
139 def _s_A(self, attrs):
# Start-tag hook for <a>: only hrefs containing the broken-link key are
# rewritten; the pass-through path is in the missing lines.
141 if self._key in dict(attrs).get('href',''):
# Positional index of the href pair within the attribute list.
143 ix = [ i for i, attr in enumerate(attrs) if attr[0] == 'href' ][0]
144 target = attrs[ix][1]
# Fragment-style target: resolve the file defining '#<anchor>' and
# re-attach the fragment to the resolved path. (The branch condition
# distinguishing this from the file-name case is in missing lines.)
146 anchor = target.split('#')[1]
147 target = self._index[anchor]
149 target = '%s#%s' % (target, anchor)
# File-style target: resolve by the bare file name instead.
151 target = self._index[os.path.split(target)[1]]
# Rewrite href relative to this document via the '../' prefix.
# NOTE(review): os.path.join would raise if target is None (anchor
# unresolved or duplicated); the guarding branch is presumably among
# the missing lines -- confirm.
154 attrs[ix] = ('href', os.path.join(self._topdir,target))
158 self.emit_starttag('a',attrs)
# (the handle-endtag context around this re-emission is missing)
164 self.emit_endtag('a')
# (the 'def stats' line is missing; returns the two counters)
167 return (self._found, self._fixed)
169 def fix(self, path, target):
# Run one HTML file through LinkFilter, rewriting links that match
# *target*, and accumulate the per-file found/fixed counts.
171 data = file(path).read()
# The directory depth of *path* determines how many '../' components are
# needed to get back to the scan root.
# NOTE(review): splitting on a literal '/' is POSIX-only; os.sep (or
# os.path functions) would be portable.
172 filt = LinkFixer.LinkFilter(self._index,
174 "../" * (len(os.path.split(path)[0].split("/"))),
# (the remaining constructor arguments, the feed()/close() calls and the
# write-back of the filtered output are in the missing lines)
178 self._found += filt.stats()[0]
179 self._fixed += filt.stats()[1]
# Tail of stats() -- its 'def' line is missing from this extract; returns
# the cumulative (files, found, fixed) triple.
182 return (self._files, self._found, self._fixed)
# Script entry: parse '-s skip-dir' options; the remaining two arguments
# are the linklint error files. NOTE(review): the try/except around
# getopt and the usage/exit plumbing are among the lines missing from
# this extract.
185 (opts, args) = getopt.getopt(sys.argv[1:], "s:")
# NOTE(review): typos in the usage text below -- 'errrorX.txt' should
# read 'errorX.txt' and 'skiped-dir' should read 'skip-dir'; left
# unchanged here because this edit adds comments only (the text is a
# runtime string).
187 sys.stderr.write("""Usage:
188 fix-links.py [-s skip-dir]... <errrorX.txt> <errorAX.txt>
190 Process the 'errorX.txt' and 'errorAX.txt' files as generated by
191 'linklint': Check all invalid links and try to find the correct
192 target. If a target is found, the link is changed accordingly,
193 otherwise the link is removed.
195 To find anchors, fix-links.py generates a complete index of all
196 anchors defined in any HTML file in the current directory or some
197 subdirectory. The directories named 'skiped-dir' (at any level) will
198 not be scanned for '*.html' files.
# Collect every '-s' option value as a directory name to skip.
202 skipdirs = [ val for opt, val in opts if opt == '-s' ]
204 fixer = LinkFixer(skipdirs)
# First pass (errorX.txt, missing-file links): a line starting '/' names
# the bad target (keyed by '#' + basename); subsequent indented ' /...'
# lines name referring documents, each fixed via fixer.fix. l[5:]
# presumably strips linklint's leading indent marker -- confirm; note
# lines may retain trailing newlines unless stripped in a missing line.
208 for l in file(args[0]):
210 if l.startswith('/'):
211 target = '#' + os.path.split(l)[1]
212 elif l.startswith(' /') and not l.endswith('/'):
213 sys.stderr.write("%s\n" % l)
214 fixer.fix(l[5:], target)
# Second pass (errorAX.txt, missing-anchor links): the lookup key is the
# fragment after '#'.
216 for l in file(args[1]):
218 if l.startswith('/'):
219 target = l.split('#')[1]
220 elif l.startswith(' /') and not l.endswith('/'):
221 sys.stderr.write("%s\n" % l)
222 fixer.fix(l[5:], target)
# Final summary (the opening write(""" and part of the format body are
# among the missing lines).
224 files, found, fixed = fixer.stats()
227 Files processed : %5d
228 Links processed : %5d
231 """ % (files, found, fixed, found-fixed))