3 import HTMLParser, re, sys, os.path
5 class ScanLinks(HTMLParser.HTMLParser):
7 def __init__(self, target, base):
8 HTMLParser.HTMLParser.__init__(self)
15 def startCollect(self):
26 def handle_starttag(self,tag,attrs):
27 m = getattr(self,'_s_'+tag.upper(),None)
31 def handle_endtag(self,tag):
32 m = getattr(self,'_e_'+tag.upper(),None)
36 def handle_data(self,data):
def handle_charref(self,name):
    """Forward a numeric character reference to handle_data as text.

    `name` is the reference without its delimiters, e.g. '38' for
    '&#38;' or 'x26' for '&#x26;'.  Passing the bare name on would make
    the digits themselves appear in the collected text ('A &#38; B'
    would be collected as 'A 38 B').  The reference is instead rebuilt
    in its markup form, which keeps the collected text meaningful and
    still valid when it is written back into HTML output.
    """
    self.handle_data('&#%s;' % name)
def handle_entityref(self,name):
    """Forward a named entity reference to handle_data as text.

    `name` is the entity name without its delimiters, e.g. 'amp' for
    '&amp;'.  Passing the bare name on would turn 'A &amp; B' into
    'A amp B' in the collected text.  The reference is instead rebuilt
    in its markup form so the collected text stays meaningful and is
    still valid when it is written back into HTML output.
    """
    self.handle_data('&%s;' % name)
49 ###########################################################################
51 SCHEME_RE=re.compile("[a-z]+:")
55 url = attrs.get('href')
56 if url and not self.SCHEME_RE.match(url):
57 if '#' in self._target:
58 p = os.path.abspath(os.path.join(self._base,url))
60 p = os.path.abspath(os.path.join(self._base,url.split('#')[0]))
66 self._value = self.endCollect()
68 WS_RE=re.compile("\\s+")
def linkName(target,f):
    """Return the text ScanLinks collects for `target` while parsing file `f`.

    A ScanLinks parser is created with `target` and the directory that
    contains `f` (as an absolute path), fed the whole content of `f`,
    and its value() is returned stripped, with every run of whitespace
    replaced by a single space.

    target -- link target handed to ScanLinks unchanged
    f      -- path of the HTML file to scan
    """
    base = os.path.split(os.path.abspath(f))[0]
    scanner = ScanLinks(target, base)
    # The former `file(f).read()` never closed the handle and relied on
    # the Python 2-only `file` builtin; a `with` block closes the file
    # deterministically even if reading or parsing raises.
    with open(f) as source:
        scanner.feed(source.read())
    return WS_RE.sub(' ', scanner.value().strip())
def _hrefPath(text):
    """Return the part of `text` between the '//' after the first ':' and the next '"'."""
    afterColon = text.split(':')[1]
    # Drop the two characters that follow the colon, then cut at the
    # closing quote of the href attribute.
    return afterColon[2:].split('"')[0]


# Copy stdin to stdout.  A line starting with '<a href=' sets the current
# target; a line starting with ' <a href=' names a referring file and gets
# the link name found in that file appended before its closing '</a>'.
for line in sys.stdin:
    if line.startswith('<a href='):
        target = _hrefPath(line)
    elif line.startswith(' <a href='):
        f = _hrefPath(line)
        # line[:-5] removes the trailing '</a>' plus newline so the name
        # can be inserted before the tag is closed again.
        line = '%s (%s)</a>\n' % (line[:-5], linkName(target, f))
    sys.stdout.write(line)