Produce more detailed statistics in fix-links.py
[senf.git] / doclib / fix-links.py
#!/usr/bin/python

import sys, os.path, fnmatch, HTMLParser, getopt, re

class HTMLFilter(HTMLParser.HTMLParser):
    """Pass-through HTML parser: everything is re-emitted verbatim to
    'out' unless a subclass intercepts a tag by defining a
    _s_TAGNAME (start tag) or _e_TAGNAME (end tag) handler."""

    def __init__(self, out=None):
        HTMLParser.HTMLParser.__init__(self)
        self._out = out
        self._collect = False
        self._data = ""

    def startCollect(self):
        self._collect = True
        self._data = ""

    def endCollect(self):
        self._collect = False
        return self._data

    def collecting(self):
        return self._collect

    def handle_starttag(self, tag, attrs):
        # Dispatch to a _s_TAGNAME handler if the subclass defines one
        m = getattr(self, '_s_' + tag.upper(), None)
        if m:
            m(attrs)
        else:
            self.emit_starttag(tag, attrs)

    def handle_endtag(self, tag):
        m = getattr(self, '_e_' + tag.upper(), None)
        if m:
            m()
        else:
            self.emit_endtag(tag)

    def handle_data(self, data):
        if self._collect:
            self._data += data
        if self._out:
            self._out.write(data)

    def handle_charref(self, name):
        self.handle_data('&#%s;' % name)

    def handle_entityref(self, name):
        self.handle_data('&%s;' % name)

    def emit_starttag(self, tag, attrs):
        self.handle_data('<%s%s>' % (tag, "".join([' %s="%s"' % attr for attr in attrs])))

    def emit_endtag(self, tag):
        self.handle_data('</%s>' % tag)


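# A minimal sketch of the dispatch convention above (the 'StripBold'
# subclass is hypothetical, not part of this script): defining _s_B and
# _e_B intercepts <b>/</b>, so the tags are swallowed while the enclosed
# text still passes through handle_data to 'out'. LinkFilter below uses
# the same mechanism to remove dead links but keep their link text.
#
#   import StringIO
#
#   class StripBold(HTMLFilter):
#       def _s_B(self, attrs):
#           pass                  # swallow the <b ...> start tag
#       def _e_B(self):
#           pass                  # swallow the </b> end tag
#
#   out = StringIO.StringIO()
#   filt = StripBold(out)
#   filt.feed('plain <b>bold</b> text')
#   filt.close()
#   assert out.getvalue() == 'plain bold text'
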
class AnchorIndex:
    """Index mapping each anchor name to the unique HTML file that
    defines it. Anchors defined in more than one file are ambiguous
    and are dropped from the index."""

    def __init__(self, skipdirs=('.svn',)):
        self._anchors = {}
        self._skipdirs = skipdirs

    def build(self):
        sys.stderr.write("Building anchor index ")
        nf = 0
        for dirname, subdirs, files in os.walk('.'):
            for d in self._skipdirs:
                if d in subdirs:
                    subdirs.remove(d)
            for f in fnmatch.filter(files, '*.html'):
                nf += 1
                path = os.path.normpath(os.path.join(dirname, f))
                self._addAnchor(f, path)
                self._extractAnchors(path)
        sys.stderr.write(" Done.\n")
        dups = 0
        for k in self._anchors.keys():
            if not self._anchors[k]:
                dups += 1
                del self._anchors[k]
        sys.stderr.write("%d unique anchors in %d files (%d duplicates)\n"
                         % (len(self._anchors), nf, dups))

    def _addAnchor(self, anchor, path):
        if self._anchors.has_key(anchor):
            # Seen before: mark as a duplicate; purged after the walk
            self._anchors[anchor] = None
        else:
            self._anchors[anchor] = path
            if len(self._anchors) % 100 == 0:
                sys.stderr.write('.')

    def __getitem__(self, key):
        return self._anchors.get(key)

    class AnchorExtractor(HTMLFilter):

        def __init__(self):
            HTMLFilter.__init__(self)
            self._anchors = {}

        def _s_A(self, attrs):
            attrs = dict(attrs)
            if attrs.has_key('name'):
                self._anchors[attrs['name']] = None

        def anchors(self):
            return self._anchors.keys()

    def _extractAnchors(self, f):
        extractor = AnchorIndex.AnchorExtractor()
        extractor.feed(file(f).read())
        extractor.close()
        for anchor in extractor.anchors():
            self._addAnchor(anchor, f)

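# Typical use (hypothetical anchor name and path, for illustration):
# after build(), indexing returns the unique file defining an anchor,
# or None if the anchor is unknown or was defined in several files.
#
#   index = AnchorIndex()
#   index.build()            # walks '.' for *.html files
#   index['classFoo']        # e.g. 'senf/classFoo.html', or None
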
TAG_RE = re.compile("<[^>]*>")
REF_RE = re.compile("&[^;]*;")

def stripHTML(s):
    s = TAG_RE.sub("", s)
    s = s.replace("&nbsp;", " ").replace("\n", " ")
    s = REF_RE.sub("?", s)
    return s.strip()

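# For example (illustrative input), stripHTML reduces collected link
# markup to a plain-text label; entity references other than &nbsp;
# become '?':
#
#   stripHTML('foo&nbsp;<b>bar</b> &amp; baz')   # -> 'foo bar ? baz'
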
class LinkFixer:

    def __init__(self, skipdirs=('.svn',)):
        self._index = AnchorIndex(skipdirs)

    def init(self):
        self._index.build()
        self._files = 0
        self._found = 0
        self._fixed = 0
        self._removed = {}

    class LinkFilter(HTMLFilter):

        def __init__(self, index, key, topdir, out):
            HTMLFilter.__init__(self, out)
            self._index = index
            self._key = key
            self._topdir = topdir
            self._found = 0
            self._fixed = 0
            self._removed = {}

        def _s_A(self, attrs):
            self._skip_a = False
            if self._key in dict(attrs).get('href', ''):
                self._found += 1
                ix = [ i for i, attr in enumerate(attrs) if attr[0] == 'href' ][0]
                anchor = attrs[ix][1]
                if '#' in anchor:
                    anchor = anchor.split('#')[1]
                    a = anchor
                    target = None
                    while not target:
                        target = self._index[a]
                        if target:
                            target = '%s#%s' % (target, a)
                        elif a.startswith('g'):
                            # Not found: retry with a leading 'g' stripped
                            a = a[1:]
                        else:
                            break
                else:
                    anchor = os.path.split(anchor)[1]
                    target = self._index[anchor]
                if target:
                    self._fixed += 1
                    attrs[ix] = ('href', os.path.join(self._topdir, target))
                else:
                    # No target found: drop the <a>...</a> tags but keep
                    # the link text, collecting it for the statistics
                    self._removed[anchor] = {}
                    self._collectFor = anchor
                    self.startCollect()
                    return
            self.emit_starttag('a', attrs)

        def _e_A(self):
            if self.collecting():
                self._removed[self._collectFor][stripHTML(self.endCollect())] = None
            else:
                self.emit_endtag('a')

        def stats(self):
            return (self._found, self._fixed, self._removed)

    def fix(self, path, target):
        self._files += 1
        data = file(path).read()
        filt = LinkFixer.LinkFilter(self._index,
                                    target,
                                    "../" * len(os.path.split(path)[0].split("/")),
                                    file(path, "w"))
        filt.feed(data)
        filt.close()
        found, fixed, removed = filt.stats()
        self._found += found
        self._fixed += fixed
        for anchor, labels in removed.items():
            for label in labels.keys():
                self._removed.setdefault((anchor, label), {})[path] = None

    def stats(self):
        return (self._files, self._found, self._fixed, self._removed)

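# Note on the return value of LinkFixer.stats(): 'removed' maps each
# (anchor, link-label) pair to a dict whose keys are the files the dead
# link appeared in; the dicts are used as sets, the values are None.
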
(opts, args) = getopt.getopt(sys.argv[1:], "vs:")
if len(args) != 2:
    sys.stderr.write("""Usage:
        fix-links.py [-s skip-dir]... <errorX.txt> <errorAX.txt>

Process the 'errorX.txt' and 'errorAX.txt' files as generated by
'linklint': Check all invalid links and try to find the correct
target. If a target is found, the link is changed accordingly,
otherwise the link is removed.

To find anchors, fix-links.py generates a complete index of all
anchors defined in any HTML file in the current directory or any
subdirectory. Directories named 'skip-dir' (at any level) will
not be scanned for '*.html' files.
""")
    sys.exit(1)
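
# Both input files are expected in linklint's cross-referenced format
# (hypothetical example, inferred from the parsing below): an unindented
# line starting with '/' names a broken target, the indented '    /...'
# lines following it name the referring files, and lines ending in '/'
# (directories) are skipped:
#
#   /missing/page.html#some_anchor
#       /docs/index.html
#       /docs/other.html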

skipdirs = [ val for opt, val in opts if opt == '-s' ]
verbose = ('-v', '') in opts

if not os.path.exists(args[0]) and not os.path.exists(args[1]):
    # No bad links, so nothing to do
    sys.exit(0)

fixer = LinkFixer(skipdirs)
fixer.init()

target = None

# errorX.txt: broken links (missing files), cross-referenced by linklint
if os.path.exists(args[0]):
    for l in file(args[0]):
        l = l.rstrip()
        if l.startswith('/'):
            target = '#' + os.path.split(l)[1]
        elif l.startswith('    /') and not l.endswith('/'):
            sys.stderr.write("%s\n" % l)
            fixer.fix(l[5:], target)

# errorAX.txt: broken anchors, cross-referenced by linklint
if os.path.exists(args[1]):
    for l in file(args[1]):
        l = l.rstrip()
        if l.startswith('/'):
            target = l.split('#')[1]
        elif l.startswith('    /') and not l.endswith('/'):
            sys.stderr.write("%s\n" % l)
            fixer.fix(l[5:], target)

total, found, fixed, removed = fixer.stats()

if verbose:
    sys.stderr.write("\nRemoved links:\n")
    for (anchor, label), files in removed.items():
        sys.stdout.write("%-36.36s %-48.48s %s\n"
                         % ( anchor,
                             "(%s)" % label[:46],
                             " ".join(files.keys())) )

sys.stderr.write("""
Files processed : %5d
Links processed : %5d
Links fixed     : %5d
Links removed   : %5d
""" % (total, found, fixed, found - fixed))