X-Git-Url: http://g0dil.de/git?a=blobdiff_plain;f=doclib%2Ffix-links.py;h=3958e3fa0f946b066b9bfbbf1ca5d50980caacde;hb=HEAD;hp=d0df7f6a16ef1f3e7acd1ccabdccae461cc40f84;hpb=e84dd6c52a07fc9e283cbd72c7616f3523920387;p=senf.git

diff --git a/doclib/fix-links.py b/doclib/fix-links.py
index d0df7f6..3958e3f 100644
--- a/doclib/fix-links.py
+++ b/doclib/fix-links.py
@@ -1,6 +1,32 @@
 #!/usr/bin/python
-
-import sys,os.path,fnmatch, HTMLParser, getopt
+#
+# This tool will hack the doxygen-generated documentation to fix link
+# errors produced by doxygen.
+#
+# This works because most anchors doxygen generates are unique 32 char
+# hash values. To speed up the operation, the tool will not check all
+# the files itself but will let 'linklint' do the grunt
+# work. fix-links.py reads the 'errorX.txt' and 'errorAX.txt' files
+# generated by linklint. These files list links to missing files
+# (errorX.txt) and links to missing anchors
+# (errorAX.txt). fix-links.py works in the following way:
+#
+# - Build a complete index of all unique anchors found in any html
+#   file. The index will only include *unique* anchors. Anchors found
+#   multiple times are removed from the index.
+#
+# - The index is extended to include all unique names of html files.
+#
+# - Scan the linklint results and check the bad links against the
+#   index. If the file or anchor is found in the index, an accordingly
+#   corrected link is generated, otherwise the link is removed.
+#
+# One additional tweak is that fix-links.py will successively remove
+# initial 'g' characters from anchors until the link is found in the
+# index. Doxygen seems to create links with the wrong number of 'g'
+# characters in front sometimes.
+
+import sys,os.path,fnmatch, HTMLParser, getopt, re, codecs
 
 class HTMLFilter(HTMLParser.HTMLParser):
 
@@ -42,10 +68,10 @@ class HTMLFilter(HTMLParser.HTMLParser):
         self._out.write(data)
 
     def handle_charref(self,name):
-        self.handle_data(name)
+        self.handle_data('&#%s;' % name)
 
     def handle_entityref(self,name):
-        self.handle_data(name)
+        self.handle_data('&%s;' % name)
 
     def emit_starttag(self,tag,attrs):
         self.handle_data('<%s%s>' % (tag, "".join([' %s="%s"' % attr for attr in attrs])))
@@ -113,6 +139,14 @@ class AnchorIndex:
                 for anchor in extractor.anchors():
                     self._addAnchor(anchor, f)
 
+TAG_RE = re.compile("<[^>]*>")
+REF_RE = re.compile("&[^;]*;")
+
+def stripHTML(s):
+    s = TAG_RE.sub("",s)
+    s = s.replace("&nbsp;"," ").replace("\n"," ")
+    s = REF_RE.sub("?",s)
+    return s.strip()
 
 class LinkFixer:
 
@@ -121,76 +155,71 @@ class LinkFixer:
 
     def init(self):
        self._index.build()
-        self._files = 0
-        self._found = 0
-        self._fixed = 0
 
     class LinkFilter(HTMLFilter):
 
-        def __init__(self, index, key, topdir, out):
+        def __init__(self, index, topdir, out):
             HTMLFilter.__init__(self, out)
             self._index = index
-            self._key = key
             self._topdir = topdir
-            self._skip_a = False
-            self._found = 0
-            self._fixed = 0
 
-        def _s_A(self, attrs):
-            self._skip_a = False
-            if self._key in dict(attrs).get('href',''):
-                self._found += 1
-                ix = [ i for i, attr in enumerate(attrs) if attr[0] == 'href' ][0]
-                target = attrs[ix][1]
-                if '#' in target:
-                    anchor = target.split('#')[1]
-                    target = self._index[anchor]
-                    if target:
-                        target = '%s#%s' % (target, anchor)
-                else:
-                    target = self._index[os.path.split(target)[1]]
+        def _check(self, tag, linkAttr, attrs):
+            ix = target = None
+            for i,(k,v) in enumerate(attrs):
+                if k == linkAttr:
+                    ix, target = i, v
+                    break
+            if target:
+                fix = False
+                tdir = anchor = None
+                if '#' in target : target, anchor = target.rsplit('#',1)
+                if '/' in target : tdir, target = target.rsplit('/', 1)
+                newTarget = None
+                if anchor:
+                    newTarget = self.anchorLookup(anchor)
+                if newTarget is None:
+                    newTarget = self.fileLookup(target)
+                    if newTarget and anchor:
+                        newTarget = '%s#%s' % (newTarget, anchor)
+                if newTarget:
+                    attrs[ix] = (attrs[ix][0], '/'.join((self._topdir, newTarget)))
+            self.emit_starttag(tag, attrs)
+
+        def anchorLookup(self,anchor):
+            target = None
+            while not target:
+                target = self._index[anchor]
                 if target:
-                    self._fixed += 1
-                    attrs[ix] = ('href', os.path.join(self._topdir,target))
+                    target = '%s#%s' % (target, anchor)
+                elif anchor.startswith('g'):
+                    anchor = anchor[1:]
                 else:
-                    self._skip_a = True
-                    return
-            self.emit_starttag('a',attrs)
-
-        def _e_A(self):
-            if self._skip_a:
-                self._skip_a = False
-            else:
-                self.emit_endtag('a')
-
-        def stats(self):
-            return (self._found, self._fixed)
-
-    def fix(self, path, target):
-        self._files += 1
-        data = file(path).read()
+                    break
+            return target
+
+        def fileLookup(self,target):
+            return self._index[target]
+
+        def _s_A(self, attrs):
+            self._check('a', 'href', attrs)
+
+        def _s_AREA(self, attrs):
+            self._check('area', 'href', attrs)
+
+    def fix(self, path):
+        data = codecs.open(path, "r", "utf-8").read()
         filt = LinkFixer.LinkFilter(self._index,
-                                    target,
-                                    "../" * (len(os.path.split(path)[0].split("/"))),
-                                    file(path,"w"))
+                                    ("../" * (len(os.path.split(path)[0].split("/"))))[:-1],
+                                    codecs.open(path, "w", "utf-8") )
         filt.feed(data)
         filt.close()
-        self._found += filt.stats()[0]
-        self._fixed += filt.stats()[1]
-
-    def stats(self):
-        return (self._files, self._found, self._fixed)
 
-(opts, args) = getopt.getopt(sys.argv[1:], "s:")
-if len(args) != 2:
-    sys.stderr.write("""Usage:
-    fix-links.py [-s skip-dir]...
+(opts, args) = getopt.getopt(sys.argv[1:], "vs:")
+if len(args) != 0:
+    sys.stderr.write("""Usage: fix-links.py [-s skip-dir]...
 
-Process the 'errorX.txt' and 'errorAX.txt' files as generated by
-'linklint': Check all invalid links and try to find the correct
-target. If a target is found, the link is changed accordingly,
-otherwise the link is removed.
+Check all links and try to find the correct target. If a target is
+found, the link is changed accordingly, otherwise the link is removed.
 
 To find anchors, fix-links.py generates a complete index of all
 anchors defined in any HTML file in the current directory or some
@@ -200,32 +229,16 @@ not be scanned for '*.html' files.
     sys.exit(1)
 
 skipdirs = [ val for opt, val in opts if opt == '-s' ]
+verbose = ( '-v', '' ) in opts
 
 fixer = LinkFixer(skipdirs)
 fixer.init()
 
-target = None
-for l in file(args[0]):
-    l = l.rstrip()
-    if l.startswith('/'):
-        target = '#' + os.path.split(l)[1]
-    elif l.startswith(' /') and not l.endswith('/'):
-        sys.stderr.write("%s\n" % l)
-        fixer.fix(l[5:], target)
-
-for l in file(args[1]):
-    l = l.rstrip()
-    if l.startswith('/'):
-        target = l.split('#')[1]
-    elif l.startswith(' /') and not l.endswith('/'):
-        sys.stderr.write("%s\n" % l)
-        fixer.fix(l[5:], target)
-
-files, found, fixed = fixer.stats()
-
-sys.stderr.write("""
-Files processed : %5d
-Links processed : %5d
-Links fixed     : %5d
-Links removed   : %5d
-""" % (files, found, fixed, found-fixed))
+for dirname, subdirs, files in os.walk('.'):
+    for d in skipdirs:
+        if d in subdirs:
+            subdirs.remove(d)
+    for f in fnmatch.filter(files,'*.html'):
+        path = os.path.normpath(os.path.join(dirname, f))
+        print path
+        fixer.fix(path)
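
Note on the anchor lookup: the core of this change is the retry loop in
anchorLookup(), which implements the 'g'-stripping heuristic described
in the new header comment. A minimal, self-contained sketch of that
heuristic follows; the plain 'index' dict is a stand-in for the patch's
AnchorIndex class (which likewise yields None for unknown or ambiguous
anchors), and the sample values are hypothetical, not taken from real
doxygen output:

    # Sketch of the lookup heuristic, assuming 'index' maps unique
    # anchor (and file) names to the HTML file defining them.
    def anchor_lookup(index, anchor):
        while True:
            target = index.get(anchor)
            if target:
                # Found: rebuild the full 'file#anchor' reference.
                return '%s#%s' % (target, anchor)
            if anchor.startswith('g'):
                # Doxygen sometimes emits surplus leading 'g'
                # characters; strip one and retry against the index.
                anchor = anchor[1:]
            else:
                return None  # unresolvable; the caller drops the link

    index = {'ga0b1c2': 'group__packet.html'}  # hypothetical entry
    assert anchor_lookup(index, 'gga0b1c2') == 'group__packet.html#ga0b1c2'
    assert anchor_lookup(index, 'deadbeef') is None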