import sys, os.path, fnmatch, HTMLParser, getopt, re
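
# HTMLFilter is a pass-through filter built on HTMLParser: everything parsed
# is re-emitted via handle_data(), except for tags that a subclass intercepts
# by defining a '_s_<TAG>' (start tag) or '_e_<TAG>' (end tag) method. It also
# provides a small text-collection facility (startCollect()/endCollect()),
# which LinkFilter below uses to capture the label text of links it removes.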
class HTMLFilter(HTMLParser.HTMLParser):

    def __init__(self, out=None):
        HTMLParser.HTMLParser.__init__(self)
        self._out = out
        self._collect = False
        self._data = ""

    def startCollect(self):
        # assumed: buffer character data until endCollect()
        self._collect = True
        self._data = ""

    def endCollect(self):
        self._collect = False
        return self._data

    def collecting(self):
        return self._collect

    def handle_starttag(self, tag, attrs):
        m = getattr(self, '_s_' + tag.upper(), None)
        if m:
            m(attrs)
        else:
            self.emit_starttag(tag, attrs)

    def handle_endtag(self, tag):
        m = getattr(self, '_e_' + tag.upper(), None)
        if m:
            m()
        else:
            self.emit_endtag(tag)

    def handle_data(self, data):
        # assumed: collected text is buffered and still written to 'out'
        if self._collect:
            self._data += data
        if self._out:
            self._out.write(data)

    def handle_charref(self, name):
        self.handle_data('&#%s;' % name)

    def handle_entityref(self, name):
        self.handle_data('&%s;' % name)

    def emit_starttag(self, tag, attrs):
        self.handle_data('<%s%s>' % (tag, "".join([' %s="%s"' % attr for attr in attrs])))

    def emit_endtag(self, tag):
        self.handle_data('</%s>' % tag)
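
# AnchorIndex walks the directory tree once and maps every HTML file name and
# every '<a name=...>' anchor to the file defining it. Names defined more than
# once cannot be resolved unambiguously and are dropped as duplicates.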
class AnchorIndex:

    def __init__(self, skipdirs = ('.svn',)):
        self._anchors = {}
        self._skipdirs = skipdirs
        sys.stderr.write("Building anchor index ")
        nf = 0
        for dirname, subdirs, files in os.walk('.'):
            # prune skipped directories (e.g. .svn) from the walk
            for d in self._skipdirs:
                if d in subdirs:
                    subdirs.remove(d)
            for f in fnmatch.filter(files, '*.html'):
                nf += 1
                path = os.path.normpath(os.path.join(dirname, f))
                self._addAnchor(f, path)
                self._extractAnchors(path)
        sys.stderr.write(" Done.\n")
        # anchors defined more than once were set to None by _addAnchor();
        # drop them so only unambiguous targets remain
        dups = 0
        for k in self._anchors.keys():
            if not self._anchors[k]:
                del self._anchors[k]
                dups += 1
        sys.stderr.write("%d unique anchors in %d files (%d duplicates)\n"
                         % (len(self._anchors), nf, dups))

    def _addAnchor(self, anchor, path):
        if self._anchors.has_key(anchor):
            # duplicate definition: mark the anchor as unusable
            self._anchors[anchor] = None
        else:
            self._anchors[anchor] = path
        if len(self._anchors) % 100 == 0:
            sys.stderr.write('.')  # (assumed) progress dot

    def __getitem__(self, key):
        return self._anchors.get(key)
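
    # AnchorExtractor collects the 'name' attribute of every <a> tag it sees;
    # AnchorIndex feeds it the contents of each HTML file to find the anchors
    # defined there.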
    class AnchorExtractor(HTMLFilter):

        def __init__(self):
            HTMLFilter.__init__(self)
            self._anchors = {}

        def _s_A(self, attrs):
            attrs = dict(attrs)
            if attrs.has_key('name'):
                self._anchors[attrs['name']] = None

        def anchors(self):
            return self._anchors.keys()

    def _extractAnchors(self, f):
        extractor = AnchorIndex.AnchorExtractor()
        extractor.feed(file(f).read())
        extractor.close()
        for anchor in extractor.anchors():
            self._addAnchor(anchor, f)
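
# Helpers for the "Removed links" report: turn a collected link label back
# into plain text by stripping tags, normalizing whitespace and replacing any
# remaining entity reference with '?'.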
TAG_RE = re.compile("<[^>]*>")
REF_RE = re.compile("&[^;]*;")

def stripHTML(s):
    s = TAG_RE.sub("", s)
    s = s.replace("&nbsp;", " ").replace("\n", " ")
    s = REF_RE.sub("?", s)
    return s.strip()
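
# LinkFixer drives the whole process: it owns the anchor index, gets handed
# each broken link reported by linklint via fix(), and accumulates statistics
# about how many links could be repaired and which had to be removed.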
class LinkFixer:

    def __init__(self, skipdirs=('.svn',)):
        self._index = AnchorIndex(skipdirs)
        # counters returned by stats() below
        self._files = 0
        self._found = 0
        self._fixed = 0
        self._removed = {}
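
    # LinkFilter rewrites a single HTML document. For every <a> tag whose href
    # contains the broken key it looks the target up in the anchor index; if a
    # target is found the href is rewritten relative to 'topdir', otherwise the
    # tag is dropped and its label text is collected for the final report.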
    class LinkFilter(HTMLFilter):

        def __init__(self, index, key, topdir, out):
            HTMLFilter.__init__(self, out)
            self._index = index
            self._key = key
            self._topdir = topdir
            self._found = 0
            self._fixed = 0
            self._removed = {}

        def _s_A(self, attrs):
            if self._key in dict(attrs).get('href',''):
                self._found += 1
                ix = [ i for i, attr in enumerate(attrs) if attr[0] == 'href' ][0]
                anchor = attrs[ix][1]
                if '#' in anchor:
                    anchor = anchor.split('#')[1]
                    a = anchor  # assumed; part of the original lookup logic is not preserved here
                    target = self._index[a]
                    if target:
                        target = '%s#%s' % (target, a)
                    elif a.startswith('g'):
                        # assumed fallback: retry 'g'-prefixed (doxygen group) anchors without the prefix
                        target = self._index[a[1:]]
                        if target:
                            target = '%s#%s' % (target, a[1:])
                else:
                    anchor = os.path.split(anchor)[1]
                    target = self._index[anchor]
                if target:
                    self._fixed += 1
                    attrs[ix] = ('href', os.path.join(self._topdir, target))
                else:
                    # no replacement target found: drop the tag, collect its label text
                    self._removed[anchor] = {}
                    self._collectFor = anchor
                    self.startCollect()
                    return
            self.emit_starttag('a', attrs)

        def _e_A(self):
            if self.collecting():
                self._removed[self._collectFor][stripHTML(self.endCollect())] = None
            else:
                self.emit_endtag('a')

        def stats(self):
            return (self._found, self._fixed, self._removed)
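
    # fix() runs one file through a LinkFilter. 'path' is the file containing
    # the bad links, 'target' the key identifying them; the directory depth of
    # 'path' becomes the '../' prefix needed to reach the documentation root.
    # The output stream is assumed to reopen the same file for writing, i.e.
    # the file is rewritten in place.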
    def fix(self, path, target):
        self._files += 1
        data = file(path).read()
        filt = LinkFixer.LinkFilter(self._index,
                                    target,
                                    "../" * (len(os.path.split(path)[0].split("/"))),
                                    file(path, "w"))  # assumed: rewrite in place
        filt.feed(data)
        filt.close()
        found, fixed, removed = filt.stats()
        self._found += found
        self._fixed += fixed
        for anchor, labels in removed.items():
            for label in labels.keys():
                self._removed.setdefault((anchor,label),{})[path] = None

    def stats(self):
        return (self._files, self._found, self._fixed, self._removed)
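
# Command line processing: '-s dir' may be given repeatedly to name
# directories that are not scanned for HTML files, '-v' sets a verbose flag.
# The two positional arguments are linklint's errorX.txt (missing files) and
# errorAX.txt (missing anchors) cross-reference listings.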
(opts, args) = getopt.getopt(sys.argv[1:], "vs:")

if len(args) != 2:
    sys.stderr.write("""Usage:
    fix-links.py [-s skip-dir]... <errorX.txt> <errorAX.txt>

Process the 'errorX.txt' and 'errorAX.txt' files as generated by
'linklint': Check all invalid links and try to find the correct
target. If a target is found, the link is changed accordingly,
otherwise the link is removed.

To find anchors, fix-links.py generates a complete index of all
anchors defined in any HTML file in the current directory or some
subdirectory. The directories named 'skip-dir' (at any level) will
not be scanned for '*.html' files.
""")
    sys.exit(1)

skipdirs = [ val for opt, val in opts if opt == '-s' ]
verbose = ( '-v', '' ) in opts

if not os.path.exists(args[0]) and not os.path.exists(args[1]):
    # No bad links, so nothing to do
    sys.exit(0)

fixer = LinkFixer(skipdirs)

if os.path.exists(args[0]):
    # missing files: a line starting with '/' names the missing target, the
    # indented lines below it name the files containing the bad links
    for l in file(args[0]):
        l = l.rstrip()
        if l.startswith('/'):
            target = '#' + os.path.split(l)[1]
        elif l.startswith('    /') and not l.endswith('/'):
            sys.stderr.write("%s\n" % l)
            fixer.fix(l[5:], target)

if os.path.exists(args[1]):
    # missing anchors: the text after '#' is the anchor name to search for
    for l in file(args[1]):
        l = l.rstrip()
        if l.startswith('/'):
            target = l.split('#')[1]
        elif l.startswith('    /') and not l.endswith('/'):
            sys.stderr.write("%s\n" % l)
            fixer.fix(l[5:], target)

total, found, fixed, removed = fixer.stats()

if removed:
    sys.stderr.write("\nRemoved links:\n")
    for (anchor, label), files in removed.items():
        sys.stdout.write("%-36.36s %-48.48s %s\n"
                         % (anchor, label, " ".join(files.keys())))

sys.stderr.write("""
Files processed : %5d
Links processed : %5d
Links fixed     : %5d
Links removed   : %5d
""" % (total, found, fixed, found-fixed))