Produce more detailed statistics in fix-links.py
[senf.git] / doclib / fix-links.py
#!/usr/bin/python

import sys, os.path, fnmatch, HTMLParser, getopt, re

class HTMLFilter(HTMLParser.HTMLParser):
    """Pass-through HTML parser: everything is re-emitted verbatim to
    'out' unless a subclass intercepts a tag by defining a
    _s_TAGNAME (start tag) or _e_TAGNAME (end tag) handler."""

    def __init__(self, out=None):
        HTMLParser.HTMLParser.__init__(self)
        self._out = out
        self._collect = False
        self._data = ""

    def startCollect(self):
        self._collect = True
        self._data = ""

    def endCollect(self):
        self._collect = False
        return self._data

    def collecting(self):
        return self._collect

    def handle_starttag(self, tag, attrs):
        # Dispatch to a _s_TAGNAME handler if the subclass defines one
        m = getattr(self, '_s_' + tag.upper(), None)
        if m:
            m(attrs)
        else:
            self.emit_starttag(tag, attrs)

    def handle_endtag(self, tag):
        m = getattr(self, '_e_' + tag.upper(), None)
        if m:
            m()
        else:
            self.emit_endtag(tag)

    def handle_data(self, data):
        if self._collect:
            self._data += data
        if self._out:
            self._out.write(data)

    def handle_charref(self, name):
        self.handle_data('&#%s;' % name)

    def handle_entityref(self, name):
        self.handle_data('&%s;' % name)

    def emit_starttag(self, tag, attrs):
        self.handle_data('<%s%s>' % (tag, "".join([' %s="%s"' % attr for attr in attrs])))

    def emit_endtag(self, tag):
        self.handle_data('</%s>' % tag)


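# A minimal sketch of the dispatch convention above (the 'StripBold'
# subclass is hypothetical, not part of this script): defining _s_B and
# _e_B intercepts <b>/</b>, so the tags are swallowed while the enclosed
# text still passes through handle_data to 'out'. LinkFilter below uses
# the same mechanism to remove dead links but keep their link text.
#
#   import StringIO
#
#   class StripBold(HTMLFilter):
#       def _s_B(self, attrs):
#           pass                  # swallow the <b ...> start tag
#       def _e_B(self):
#           pass                  # swallow the </b> end tag
#
#   out = StringIO.StringIO()
#   filt = StripBold(out)
#   filt.feed('plain <b>bold</b> text')
#   filt.close()
#   assert out.getvalue() == 'plain bold text'
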
class AnchorIndex:
    """Index mapping each anchor name to the unique HTML file that
    defines it. Anchors defined in more than one file are ambiguous
    and are dropped from the index."""

    def __init__(self, skipdirs=('.svn',)):
        self._anchors = {}
        self._skipdirs = skipdirs

    def build(self):
        sys.stderr.write("Building anchor index ")
        nf = 0
        for dirname, subdirs, files in os.walk('.'):
            for d in self._skipdirs:
                if d in subdirs:
                    subdirs.remove(d)
            for f in fnmatch.filter(files, '*.html'):
                nf += 1
                path = os.path.normpath(os.path.join(dirname, f))
                self._addAnchor(f, path)
                self._extractAnchors(path)
        sys.stderr.write(" Done.\n")
        dups = 0
        for k in self._anchors.keys():
            if not self._anchors[k]:
                dups += 1
                del self._anchors[k]
        sys.stderr.write("%d unique anchors in %d files (%d duplicates)\n"
                         % (len(self._anchors), nf, dups))

    def _addAnchor(self, anchor, path):
        if self._anchors.has_key(anchor):
            # Seen before: mark as a duplicate; purged after the walk
            self._anchors[anchor] = None
        else:
            self._anchors[anchor] = path
            if len(self._anchors) % 100 == 0:
                sys.stderr.write('.')

    def __getitem__(self, key):
        return self._anchors.get(key)

    class AnchorExtractor(HTMLFilter):

        def __init__(self):
            HTMLFilter.__init__(self)
            self._anchors = {}

        def _s_A(self, attrs):
            attrs = dict(attrs)
            if attrs.has_key('name'):
                self._anchors[attrs['name']] = None

        def anchors(self):
            return self._anchors.keys()

    def _extractAnchors(self, f):
        extractor = AnchorIndex.AnchorExtractor()
        extractor.feed(file(f).read())
        extractor.close()
        for anchor in extractor.anchors():
            self._addAnchor(anchor, f)

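# Typical use (hypothetical anchor name and path, for illustration):
# after build(), indexing returns the unique file defining an anchor,
# or None if the anchor is unknown or was defined in several files.
#
#   index = AnchorIndex()
#   index.build()            # walks '.' for *.html files
#   index['classFoo']        # e.g. 'senf/classFoo.html', or None
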
TAG_RE = re.compile("<[^>]*>")
REF_RE = re.compile("&[^;]*;")

def stripHTML(s):
    s = TAG_RE.sub("", s)
    s = s.replace("&nbsp;", " ").replace("\n", " ")
    s = REF_RE.sub("?", s)
    return s.strip()

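# For example (illustrative input), stripHTML reduces collected link
# markup to a plain-text label; entity references other than &nbsp;
# become '?':
#
#   stripHTML('foo&nbsp;<b>bar</b> &amp; baz')   # -> 'foo bar ? baz'
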
class LinkFixer:

    def __init__(self, skipdirs=('.svn',)):
        self._index = AnchorIndex(skipdirs)

    def init(self):
        self._index.build()
        self._files = 0
        self._found = 0
        self._fixed = 0
        self._removed = {}

    class LinkFilter(HTMLFilter):

        def __init__(self, index, key, topdir, out):
            HTMLFilter.__init__(self, out)
            self._index = index
            self._key = key
            self._topdir = topdir
            self._found = 0
            self._fixed = 0
            self._removed = {}

        def _s_A(self, attrs):
            self._skip_a = False
            if self._key in dict(attrs).get('href', ''):
                self._found += 1
                ix = [ i for i, attr in enumerate(attrs) if attr[0] == 'href' ][0]
                anchor = attrs[ix][1]
                if '#' in anchor:
                    anchor = anchor.split('#')[1]
                    a = anchor
                    target = None
                    while not target:
                        target = self._index[a]
                        if target:
                            target = '%s#%s' % (target, a)
                        elif a.startswith('g'):
                            # Not found: retry with a leading 'g' stripped
                            a = a[1:]
                        else:
                            break
                else:
                    anchor = os.path.split(anchor)[1]
                    target = self._index[anchor]
                if target:
                    self._fixed += 1
                    attrs[ix] = ('href', os.path.join(self._topdir, target))
                else:
                    # No target found: drop the <a>...</a> tags but keep
                    # the link text, collecting it for the statistics
                    self._removed[anchor] = {}
                    self._collectFor = anchor
                    self.startCollect()
                    return
            self.emit_starttag('a', attrs)

        def _e_A(self):
            if self.collecting():
                self._removed[self._collectFor][stripHTML(self.endCollect())] = None
            else:
                self.emit_endtag('a')

        def stats(self):
            return (self._found, self._fixed, self._removed)

    def fix(self, path, target):
        self._files += 1
        data = file(path).read()
        filt = LinkFixer.LinkFilter(self._index,
                                    target,
                                    "../" * len(os.path.split(path)[0].split("/")),
                                    file(path, "w"))
        filt.feed(data)
        filt.close()
        found, fixed, removed = filt.stats()
        self._found += found
        self._fixed += fixed
        for anchor, labels in removed.items():
            for label in labels.keys():
                self._removed.setdefault((anchor, label), {})[path] = None

    def stats(self):
        return (self._files, self._found, self._fixed, self._removed)

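# Note on the return value of LinkFixer.stats(): 'removed' maps each
# (anchor, link-label) pair to a dict whose keys are the files the dead
# link appeared in; the dicts are used as sets, the values are None.
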
(opts, args) = getopt.getopt(sys.argv[1:], "vs:")
if len(args) != 2:
    sys.stderr.write("""Usage:
        fix-links.py [-s skip-dir]... <errorX.txt> <errorAX.txt>

Process the 'errorX.txt' and 'errorAX.txt' files as generated by
'linklint': Check all invalid links and try to find the correct
target. If a target is found, the link is changed accordingly,
otherwise the link is removed.

To find anchors, fix-links.py generates a complete index of all
anchors defined in any HTML file in the current directory or any
subdirectory. Directories named 'skip-dir' (at any level) will
not be scanned for '*.html' files.
""")
    sys.exit(1)
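
# Both input files are expected in linklint's cross-referenced format
# (hypothetical example, inferred from the parsing below): an unindented
# line starting with '/' names a broken target, the indented '    /...'
# lines following it name the referring files, and lines ending in '/'
# (directories) are skipped:
#
#   /missing/page.html#some_anchor
#       /docs/index.html
#       /docs/other.html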

skipdirs = [ val for opt, val in opts if opt == '-s' ]
verbose = ('-v', '') in opts

if not os.path.exists(args[0]) and not os.path.exists(args[1]):
    # No bad links, so nothing to do
    sys.exit(0)

fixer = LinkFixer(skipdirs)
fixer.init()

target = None

# errorX.txt: broken links (missing files), cross-referenced by linklint
if os.path.exists(args[0]):
    for l in file(args[0]):
        l = l.rstrip()
        if l.startswith('/'):
            target = '#' + os.path.split(l)[1]
        elif l.startswith('    /') and not l.endswith('/'):
            sys.stderr.write("%s\n" % l)
            fixer.fix(l[5:], target)

# errorAX.txt: broken anchors, cross-referenced by linklint
if os.path.exists(args[1]):
    for l in file(args[1]):
        l = l.rstrip()
        if l.startswith('/'):
            target = l.split('#')[1]
        elif l.startswith('    /') and not l.endswith('/'):
            sys.stderr.write("%s\n" % l)
            fixer.fix(l[5:], target)

total, found, fixed, removed = fixer.stats()

if verbose:
    sys.stderr.write("\nRemoved links:\n")
    for (anchor, label), files in removed.items():
        sys.stdout.write("%-36.36s %-48.48s %s\n"
                         % ( anchor,
                             "(%s)" % label[:46],
                             " ".join(files.keys())) )

sys.stderr.write("""
Files processed : %5d
Links processed : %5d
Links fixed     : %5d
Links removed   : %5d
""" % (total, found, fixed, found - fixed))