373 lines
11 KiB
Python
Executable file
373 lines
11 KiB
Python
Executable file
#!/usr/bin/env python
|
|
"""
|
|
Pylookup looks up entries in the Python documentation, especially from
within Emacs. Pylookup adopts most of its ideas from haddoc, a lovely
toolkit by Martin Blais.
|
|
|
|
(usage)
|
|
./pylookup.py -l ljust
|
|
./pylookup.py -u http://docs.python.org
|
|
|
|
"""
|
|
|
|
from __future__ import with_statement
|
|
|
|
import os
|
|
import re
|
|
import sys
|
|
from contextlib import closing
|
|
from os.path import abspath, dirname, expanduser, join
|
|
|
|
try:
|
|
import cPickle as pickle
|
|
except:
|
|
import pickle
|
|
|
|
if sys.version_info[0] == 3:
|
|
from html.parser import HTMLParser
|
|
import urllib.parse as urlparse
|
|
import urllib.request as urllib
|
|
else:
|
|
from HTMLParser import HTMLParser
|
|
import urllib2 as urllib
|
|
import urlparse
|
|
import formatter
|
|
|
|
# Progress output while parsing an index; switched on by -v/--verbose.
VERBOSE = False

# Output templates, keyed by the name accepted by -f/--format.  The
# placeholders are filled in from the attributes of an `Element`.
FORMATS = dict(
    Emacs="{entry}\t({desc})\t[{book}];{url}",
    Terminal="{entry}\t({desc})\t[{book}]\n{url}",
)
|
|
|
|
|
|
def build_book(s, num):
    """
    Build the book identifier for the link `s`.

    The identifier is "lib", "api" or "ref" when `s` contains "library",
    "c-api" or "reference" respectively (tested in that order) and "etc"
    otherwise.  Unless `num` is 1, "/<num>" is appended to the identifier.
    """
    books = (
        ("library", "lib"),
        ("c-api", "api"),
        ("reference", "ref"),
        ("", "etc"),  # the empty string is contained in every string
    )
    for needle, book in books:
        if needle not in s:
            continue
        if num == 1:
            return book
        return "%s/%d" % (book, num)
|
|
|
|
|
|
def trim(s):
    """
    Apply the global filtering rules to `s` and return the result.

    Add any further global rule to the `rules` table below; the rules
    are applied in order.
    """
    rules = (
        ("Python Enhancement Proposals!", ""),
        ("PEP ", "PEP-"),
    )
    for old, new in rules:
        s = s.replace(old, new)
    return s
|
|
|
|
|
|
class Element(object):
    """
    One index entry: its `entry` name, its `desc` (description), the
    `book` identifier and the `url` the entry links to.
    """

    def __init__(self, entry, desc, book, url):
        self.book = book
        self.url = url
        self.desc = desc
        self.entry = entry

    def __format__(self, format_spec):
        """
        Render the element, using `format_spec` as a `str.format`
        template with the fields `entry`, `desc`, `book` and `url`.
        """
        fields = {
            "entry": self.entry,
            "desc": self.desc,
            "book": self.book,
            "url": self.url,
        }
        return format_spec.format(**fields)

    def match_insensitive(self, key):
        """
        Tell whether `key` occurs in entry or desc, ignoring case.

        `key` : Lowercase string.
        """
        if key in self.entry.lower():
            return True
        return key in self.desc.lower()

    def match_sensitive(self, key):
        """
        Tell whether `key` occurs in entry or desc, respecting case.

        `key` : String, compared as given.
        """
        if key in self.entry:
            return True
        return key in self.desc

    def match_in_entry_insensitive(self, key):
        """
        Tell whether `key` occurs in entry, ignoring case.

        `key` : Lowercase string.
        """
        lowered_entry = self.entry.lower()
        return key in lowered_entry

    def match_in_entry_sensitive(self, key):
        """
        Tell whether `key` occurs in entry, respecting case.

        `key` : String, compared as given.
        """
        entry = self.entry
        return key in entry
|
|
|
|
|
|
def get_matcher(insensitive=True, desc=True):
    """
    Return the `Element.match_*` function selected by the two flags.

    `insensitive` : true to pick the case insensitive variant
    `desc`        : true to match against entry and desc, false to
                    match against the entry only

    get_matcher(0, 0) -> Element.match_in_entry_sensitive
    get_matcher(1, 0) -> Element.match_in_entry_insensitive
    get_matcher(0, 1) -> Element.match_sensitive
    get_matcher(1, 1) -> Element.match_insensitive
    """
    name_parts = ["match"]
    if not desc:
        name_parts.append("in_entry")
    if insensitive:
        name_parts.append("insensitive")
    else:
        name_parts.append("sensitive")
    return getattr(Element, "_".join(name_parts))
|
|
|
|
|
|
class IndexProcessor(HTMLParser):
|
|
"""
|
|
Extract the index links from a Python HTML documentation index.
|
|
"""
|
|
|
|
def __init__(self, writer, dirn):
|
|
try:
|
|
HTMLParser.__init__(self)
|
|
except TypeError:
|
|
HTMLParser.__init__(self, formatter.NullFormatter())
|
|
self.writer = writer
|
|
self.dirn = dirn
|
|
self.entry = ""
|
|
self.desc = ""
|
|
self.level = 0
|
|
self.one_entry = False
|
|
self.num_of_a = 0
|
|
self.desc_cnt = 0
|
|
self.tag = None
|
|
|
|
def handle_starttag(self, tag, attrs):
|
|
self.tag = tag
|
|
attrs = dict(attrs)
|
|
if tag in ['dd', 'dl', 'ul']:
|
|
self.level += 1
|
|
elif tag in ['dt', 'li']:
|
|
self.one_entry = True
|
|
self.num_of_a = 0
|
|
elif tag == 'a':
|
|
if self.one_entry:
|
|
self.url = join(self.dirn, attrs['href'])
|
|
|
|
def handle_endtag(self, tag):
|
|
self.tag = None
|
|
if tag in ['dd', 'dl', 'ul']:
|
|
self.level -= 1
|
|
elif tag in ['dt', 'li']:
|
|
self.one_entry = False
|
|
|
|
def handle_data(self, data):
|
|
if self.tag == 'a':
|
|
global VERBOSE
|
|
if self.one_entry:
|
|
if self.num_of_a == 0:
|
|
self.desc = data
|
|
|
|
if VERBOSE:
|
|
self.desc_cnt += 1
|
|
if self.desc_cnt % 100 == 0:
|
|
sys.stdout.write("%04d %s\r" %
|
|
(self.desc_cnt,
|
|
self.desc.ljust(80)))
|
|
# extract fist element
|
|
# ex) __and__() (in module operator)
|
|
if self.level == 1:
|
|
self.entry = re.sub("\([^)]+\)", "", self.desc)
|
|
|
|
# clean up PEP
|
|
self.entry = trim(self.entry)
|
|
|
|
match = re.search("\([^)]+\)", self.desc)
|
|
if match:
|
|
self.desc = match.group(0)
|
|
|
|
self.desc = trim(re.sub("[()]", "", self.desc))
|
|
|
|
self.num_of_a += 1
|
|
book = build_book(self.url, self.num_of_a)
|
|
e = Element(self.entry, self.desc, book, self.url)
|
|
|
|
self.writer(e)
|
|
|
|
# Overload save_end because of it's strange behaviour.
|
|
def save_end(self):
|
|
pass
|
|
|
|
|
|
def _fetch_index(index_url):
    """Return the content of `index_url` as a string."""
    # closing() releases the underlying handle even when read() fails.
    with closing(urllib.urlopen(index_url)) as response:
        index = response.read()
    if not isinstance(index, str):
        # Not a str (bytes on Python 3): decode with the default codec.
        index = index.decode()
    return index


def update(db, urls, append=False):
    """Update database with entries from urls.

    `db` : filename to database
    `urls` : list of URL
    `append` : append to db

    Each item of `urls` may be a URL or a local path.  When it does not
    end in ".html" the usual index locations below it are tried in turn
    until one of them can be fetched.

    Exits with status 1 (raises SystemExit) when a local path does not
    exist.
    """
    mode = "ab" if append else "wb"
    with open(db, mode) as f:
        def writer(e):
            pickle.dump(e, f)

        for url in urls:
            # detect 'file' or 'url' schemes
            parsed = urlparse.urlparse(url)
            if not parsed.scheme or parsed.scheme == "file":
                dst = abspath(expanduser(parsed.path))
                if not os.path.exists(dst):
                    print("Error: %s doesn't exist" % dst)
                    # sys.exit rather than the `exit` helper of the site
                    # module, which is not guaranteed to be defined.
                    sys.exit(1)
                url = "file://%s" % dst
            else:
                url = parsed.geturl()

            if url.endswith('.html'):
                potential_urls = [url]
            else:
                # guess index URLs
                # for stdlib, this is genindex-all.html
                # for django, numpy, etc. it's genindex.html
                # for flask, requests, it's genindex/
                url = url.rstrip("/")
                potential_urls = [
                    url + "/genindex-all.html",
                    url + "/genindex.html",
                    url + "/genindex/",
                ]

            for index_url in potential_urls:
                try:
                    print("Wait for a few seconds...")
                    print("Fetching index from '%s'" % index_url)

                    index = _fetch_index(index_url)

                    parser = IndexProcessor(writer, dirname(index_url))
                    with closing(parser):
                        parser.feed(index)

                    # success, we don't need to try other potential urls
                    print("Loaded index from '%s'" % index_url)
                    break
                except IOError:
                    print("Error: fetching file from '%s'" % index_url)
            else:
                # The loop ended without a break: no candidate worked.
                print("Failed to load index for input '%s'" % url)
|
|
|
|
|
|
def lookup(db, key, format_spec, out=sys.stdout, insensitive=True, desc=True):
    """Write every database element matching key to out, one per line.

    `db` : filename to database
    `key` : key to lookup
    `format_spec` : template each matching element is formatted with
    `out` : file-like to write to
    `insensitive` : lookup key case insensitive
    `desc` : match against the description as well as the entry
    """
    matches = get_matcher(insensitive, desc)
    # The insensitive matchers expect an already lowercased key.
    needle = key.lower() if insensitive else key
    with open(db, "rb") as stream:
        try:
            while True:
                element = pickle.load(stream)
                if not matches(element, needle):
                    continue
                out.write('%s\n' % format(element, format_spec))
        except EOFError:
            # pickle.load signals the end of the database this way.
            pass
|
|
|
|
|
|
def cache(db, out=sys.stdout):
    """Print unique entries from db to out, one per line, sorted.

    `db` : filename to database
    `out` : file-like to write to

    Parenthesized and bracketed parts are removed from each entry before
    duplicates are dropped.
    """
    # Raw strings keep "\(" and "\[" from being read as (invalid) string
    # escape sequences.
    parenthesized = re.compile(r"\([^)]*\)")
    bracketed = re.compile(r"\[[^]]*\]")

    keys = set()
    # NOTE: pickle must only be used on a database written by this tool.
    with open(db, "rb") as f:
        try:
            while True:
                e = pickle.load(f)
                keys.add(bracketed.sub("", parenthesized.sub("", e.entry)))
        except EOFError:
            # pickle.load signals the end of the database this way.
            pass

    # Sorted so that the output does not depend on set iteration order.
    for k in sorted(keys):
        out.write('%s\n' % k)
|
|
|
|
|
|
if __name__ == "__main__":
    import optparse

    # The module docstring doubles as the usage text.
    parser = optparse.OptionParser(__doc__.strip())

    # Database selection and the actions to perform on it.
    parser.add_option("-d", "--db", dest="db", default="pylookup.db",
                      help="database name")
    parser.add_option("-l", "--lookup", dest="key",
                      help="keyword to search")
    parser.add_option("-u", "--update", dest="url", action="append",
                      type="str", help="update url or path")
    parser.add_option("-c", "--cache", dest="cache", action="store_true",
                      default=False,
                      help="extract keywords, internally used")
    parser.add_option("-a", "--append", dest="append", action="store_true",
                      default=False,
                      help="append to the db from multiple sources")

    # Output and search options.
    parser.add_option("-f", "--format", dest="format",
                      choices=["Emacs", "Terminal"], default="Terminal",
                      help="type of output formatting, valid: Emacs, Terminal")
    parser.add_option("-i", "--insensitive", choices=['0', '1'], default=1,
                      help="SEARCH OPTION: insensitive search "
                           "(valid: 0, 1; default: %default)")
    parser.add_option("-s", "--desc", choices=['0', '1'], default=1,
                      help="SEARCH OPTION: include description field "
                           "(valid: 0, 1; default: %default)")
    parser.add_option("-v", "--verbose", dest="verbose",
                      action="store_true", default=False, help="verbose")

    (opts, args) = parser.parse_args()

    VERBOSE = opts.verbose

    # The actions are independent and run in this fixed order.
    if opts.url:
        update(opts.db, opts.url, opts.append)
    if opts.cache:
        cache(opts.db)
    if opts.key:
        # The 0/1 option values arrive as strings; convert them here.
        search_flags = {
            "insensitive": int(opts.insensitive),
            "desc": int(opts.desc),
        }
        lookup(opts.db, opts.key, FORMATS[opts.format], **search_flags)
|