spacemacs/layers/+lang/python/local/pylookup/pylookup.py
2018-01-17 23:23:59 -05:00

374 lines
11 KiB
Python
Executable file

#!/usr/bin/env python
"""
Pylookup looks up entries in the Python documentation, especially from within
Emacs. Pylookup adopts most of its ideas from haddoc, a lovely toolkit by
Martin Blais.
(usage)
./pylookup.py -l ljust
./pylookup.py -u http://docs.python.org
"""
from __future__ import with_statement
import os
import re
import sys
from contextlib import closing
from os.path import abspath, dirname, expanduser, join
try:
import cPickle as pickle
except:
import pickle
if sys.version_info[0] == 3:
from html.parser import HTMLParser
import urllib.parse as urlparse
import urllib.request as urllib
else:
from HTMLParser import HTMLParser
import urllib2 as urllib
import urlparse
import formatter
# When true, IndexProcessor writes a progress line to stdout every 100
# entries while an index page is being parsed.
VERBOSE = False

# Output templates for lookup results, keyed by the --format option value.
# The placeholders are filled in by Element.__format__.
FORMATS = dict(
    Emacs="{entry}\t({desc})\t[{book}];{url}",
    Terminal="{entry}\t({desc})\t[{book}]\n{url}",
)
def build_book(s, num):
    """
    Derive the short book tag for a link.

    `s`   : string (the link URL) searched for a known section name.
    `num` : number of the link within its entry; 1 for the first link.

    The first section name found in `s` decides the tag: "library" -> "lib",
    "c-api" -> "api", "reference" -> "ref", anything else -> "etc".  For any
    `num` other than 1 the tag is suffixed with "/<num>".
    """
    if "library" in s:
        tag = "lib"
    elif "c-api" in s:
        tag = "api"
    elif "reference" in s:
        tag = "ref"
    else:
        tag = "etc"

    if num == 1:
        return tag
    return "%s/%d" % (tag, num)
def trim(s):
    """
    Apply the global clean-up rules to `s` and return the result.

    Add any further global filtering rules here.
    """
    without_banner = s.replace("Python Enhancement Proposals!", "")
    return without_banner.replace("PEP ", "PEP-")
class Element(object):
    """
    One index entry: the entry name, its description, the book tag and the
    URL it points to.  Instances are what gets pickled into the database.
    """

    def __init__(self, entry, desc, book, url):
        self.entry, self.desc = entry, desc
        self.book, self.url = book, url

    def __format__(self, format_spec):
        """
        Render this element with `format_spec` used as a `str.format`
        template; the fields {entry}, {desc}, {book} and {url} are available.
        """
        fields = {
            "entry": self.entry,
            "desc": self.desc,
            "book": self.book,
            "url": self.url,
        }
        return format_spec.format(**fields)

    def match_insensitive(self, key):
        """
        Match key case insensitive against entry and desc.

        `key` : Lowercase string (only the stored text is lowered here).
        """
        return any(key in text.lower() for text in (self.entry, self.desc))

    def match_sensitive(self, key):
        """
        Match key case sensitive against entry and desc.

        `key` : String, compared exactly as given.
        """
        if key in self.entry:
            return True
        return key in self.desc

    def match_in_entry_insensitive(self, key):
        """
        Match key case insensitive against entry only.

        `key` : Lowercase string (only the stored text is lowered here).
        """
        lowered_entry = self.entry.lower()
        return key in lowered_entry

    def match_in_entry_sensitive(self, key):
        """
        Match key case sensitive against entry only.

        `key` : String, compared exactly as given.
        """
        return key in self.entry
def get_matcher(insensitive=True, desc=True):
    """
    Get the `Element.match_*` function for the given search options.

    `insensitive` : truthy selects a case-insensitive matcher.
    `desc`        : truthy also searches the description; falsy restricts
                    the search to the entry name.

    insensitive  desc  ->  returned attribute of Element
         0        0        match_in_entry_sensitive
         1        0        match_in_entry_insensitive
         0        1        match_sensitive
         1        1        match_insensitive

    The result is looked up on the class, so it must be called as
    ``matcher(element, key)``.
    """
    if desc:
        prefix = "match"
    else:
        prefix = "match_in_entry"

    if insensitive:
        suffix = "_insensitive"
    else:
        suffix = "_sensitive"

    return getattr(Element, prefix + suffix)
class IndexProcessor(HTMLParser):
    """
    Extract the index links from a Python HTML documentation index.

    The parser tracks the nesting depth of ``<dd>``/``<dl>``/``<ul>`` lists
    and treats every ``<dt>``/``<li>`` as one index entry.  Each ``<a href>``
    with text inside an entry is turned into an `Element` and handed to
    `writer`.

    `writer` : callable invoked with one `Element` per link found.
    `dirn`   : base URL (directory of the index page) that the hrefs found
               in the page are resolved against.
    """

    # One non-empty parenthesised group, e.g. "(in module operator)".
    _PAREN_GROUP = re.compile(r"\([^)]+\)")

    def __init__(self, writer, dirn):
        try:
            HTMLParser.__init__(self)
        except TypeError:
            # The base constructor rejected the no-argument form; retry with
            # a null formatter (presumably for formatter-based parser bases).
            HTMLParser.__init__(self, formatter.NullFormatter())

        self.writer = writer
        self.dirn = dirn
        self.entry = ""
        self.desc = ""
        self.url = ""         # target of the most recent <a href> in an entry
        self.level = 0        # current nesting depth of dd/dl/ul
        self.one_entry = False  # True while inside a <dt>/<li>
        self.num_of_a = 0     # links with text seen so far in this entry
        self.desc_cnt = 0     # entries counted for the verbose progress line
        self.tag = None       # tag whose text handle_data should look at

    def handle_starttag(self, tag, attrs):
        """Track list nesting / entry state and remember link targets."""
        self.tag = tag
        if tag in ('dd', 'dl', 'ul'):
            self.level += 1
        elif tag in ('dt', 'li'):
            self.one_entry = True
            self.num_of_a = 0
        elif tag == 'a' and self.one_entry:
            href = dict(attrs).get('href')
            if href is None:
                # A bare anchor (<a name=...> / <a id=...>) is a jump target,
                # not a link: there is nothing to record, so make handle_data
                # skip its text instead of failing on the missing attribute.
                self.tag = None
            else:
                # Resolve with URL rules rather than os.path.join so the
                # result never contains the platform path separator and
                # absolute hrefs are handled correctly.
                base = self.dirn.rstrip("/") + "/"
                self.url = urlparse.urljoin(base, href)

    def handle_endtag(self, tag):
        """Undo the state changes made by the matching start tag."""
        self.tag = None
        if tag in ('dd', 'dl', 'ul'):
            self.level -= 1
        elif tag in ('dt', 'li'):
            self.one_entry = False

    def handle_data(self, data):
        """Turn the text of a link inside an entry into an `Element`."""
        if self.tag != 'a' or not self.one_entry:
            return

        if self.num_of_a == 0:
            # The first link of an entry carries the entry text itself.
            self.desc = data

            if VERBOSE:
                self.desc_cnt += 1
                if self.desc_cnt % 100 == 0:
                    sys.stdout.write("%04d %s\r" %
                                     (self.desc_cnt,
                                      self.desc.ljust(80)))

            # extract first element
            #  ex) __and__() (in module operator)
            if self.level == 1:
                # Top-level entry: the name is the text without its
                # parenthesised qualifier; the qualifier becomes the desc.
                self.entry = trim(self._PAREN_GROUP.sub("", self.desc))

                match = self._PAREN_GROUP.search(self.desc)
                if match:
                    self.desc = match.group(0)

            self.desc = trim(re.sub("[()]", "", self.desc))

        self.num_of_a += 1
        book = build_book(self.url, self.num_of_a)
        self.writer(Element(self.entry, self.desc, book, self.url))

    # Override save_end with a no-op because of its strange behaviour.
    def save_end(self):
        pass
def update(db, urls, append=False):
    """Update database with entries from urls.

    `db`     : filename to database
    `urls`   : list of URL (or local paths) of documentation roots or of
               index pages ending in ``.html``
    `append` : append to db instead of overwriting it

    The database is opened (and, unless `append` is set, truncated) before
    anything is fetched.  A local path that does not exist terminates the
    process with exit status 1.  A source whose index cannot be fetched is
    reported on stdout and skipped.
    """
    mode = "ab" if append else "wb"
    with open(db, mode) as f:

        def writer(e):
            pickle.dump(e, f)

        for url in urls:
            # detect 'file' or 'url' schemes
            parsed = urlparse.urlparse(url)
            if not parsed.scheme or parsed.scheme == "file":
                dst = abspath(expanduser(parsed.path))
                if not os.path.exists(dst):
                    print("Error: %s doesn't exist" % dst)
                    # sys.exit, not the builtin exit(): the latter is only
                    # installed by the site module for interactive use.
                    sys.exit(1)
                url = "file://%s" % dst
            else:
                url = parsed.geturl()

            potential_urls = []
            if url.endswith('.html'):
                potential_urls.append(url)
            else:
                # guess index URLs
                # for stdlib, this is genindex-all.html
                # for django, numpy, etc. it's genindex.html
                # for flask, requests, it's genindex/
                url = url.rstrip("/")
                potential_urls.append(url + "/genindex-all.html")
                potential_urls.append(url + "/genindex.html")
                potential_urls.append(url + "/genindex/")

            success = False
            for index_url in potential_urls:
                try:
                    print("Wait for a few seconds...")
                    print("Fetching index from '%s'" % index_url)
                    # Close the response deterministically instead of
                    # leaving the connection / file handle to the GC.
                    with closing(urllib.urlopen(index_url)) as response:
                        index = response.read()
                    if not isinstance(index, str):
                        # Bytes on Python 3.  Replace undecodable bytes so a
                        # single stray byte cannot abort the whole update.
                        index = index.decode("utf-8", "replace")

                    parser = IndexProcessor(writer, dirname(index_url))
                    with closing(parser):
                        parser.feed(index)

                    # success, we don't need to try other potential urls
                    print("Loaded index from '%s'" % index_url)
                    success = True
                    break
                except IOError:
                    print("Error: fetching file from '%s'" % index_url)

            if not success:
                print("Failed to load index for input '%s'" % url)
def lookup(db, key, format_spec, out=sys.stdout, insensitive=True, desc=True):
    """Lookup key from database and print every match to out.

    `db`          : filename to database
    `key`         : key to lookup
    `format_spec` : `str.format` template used to render each match
    `out`         : file-like to write to
    `insensitive` : lookup key case insensitive
    `desc`        : also search the description, not only the entry name

    NOTE: the database is read with pickle, so only open files that were
    produced by a trusted run of this tool.
    """
    is_match = get_matcher(insensitive, desc)
    # The insensitive matchers lower only the stored text, so the key has to
    # be lowered here.
    needle = key.lower() if insensitive else key

    with open(db, "rb") as f:
        try:
            while True:
                element = pickle.load(f)
                if not is_match(element, needle):
                    continue
                out.write('%s\n' % format(element, format_spec))
        except EOFError:
            # pickle.load raises EOFError once the last record has been read.
            pass
def cache(db, out=sys.stdout):
    """Print unique entries from db to out, one per line, in sorted order.

    `db`  : filename to database
    `out` : file-like to write to

    Parenthesised and bracketed parts are stripped from each entry name
    before de-duplication.  The output is sorted so that it is identical
    from run to run (plain set iteration order is not).

    NOTE: the database is read with pickle, so only open files that were
    produced by a trusted run of this tool.
    """
    # Raw strings: "\(" in a normal literal is an invalid escape sequence.
    parenthesised = re.compile(r"\([^)]*\)")
    bracketed = re.compile(r"\[[^]]*\]")

    keys = set()
    with open(db, "rb") as f:
        try:
            while True:
                e = pickle.load(f)
                keys.add(bracketed.sub("", parenthesised.sub("", e.entry)))
        except EOFError:
            # pickle.load raises EOFError once the last record has been read.
            pass

    for k in sorted(keys):
        out.write('%s\n' % k)
if __name__ == "__main__":
    import optparse

    # (flags, keyword settings) of every command-line option, listed in the
    # order in which they are registered and therefore shown by --help.
    option_table = (
        (("-d", "--db"),
         dict(help="database name", dest="db", default="pylookup.db")),
        (("-l", "--lookup"),
         dict(help="keyword to search", dest="key")),
        (("-u", "--update"),
         dict(help="update url or path",
              action="append",
              type="str",
              dest="url")),
        (("-c", "--cache"),
         dict(help="extract keywords, internally used",
              action="store_true",
              default=False,
              dest="cache")),
        (("-a", "--append"),
         dict(help="append to the db from multiple sources",
              action="store_true",
              default=False,
              dest="append")),
        (("-f", "--format"),
         dict(help="type of output formatting, valid: Emacs, Terminal",
              choices=["Emacs", "Terminal"],
              default="Terminal",
              dest="format")),
        (("-i", "--insensitive"),
         dict(default=1,
              choices=['0', '1'],
              help="SEARCH OPTION: insensitive search "
              "(valid: 0, 1; default: %default)")),
        (("-s", "--desc"),
         dict(default=1,
              choices=['0', '1'],
              help="SEARCH OPTION: include description field "
              "(valid: 0, 1; default: %default)")),
        (("-v", "--verbose"),
         dict(help="verbose",
              action="store_true",
              dest="verbose",
              default=False)),
    )

    # The module docstring doubles as the usage text.
    cli = optparse.OptionParser(__doc__.strip())
    for flags, settings in option_table:
        cli.add_option(*flags, **settings)

    opts, args = cli.parse_args()
    VERBOSE = opts.verbose

    # The three actions are independent; each one requested on the command
    # line runs, in this order: update, cache, lookup.
    if opts.url:
        update(opts.db, opts.url, opts.append)

    if opts.cache:
        cache(opts.db)

    if opts.key:
        # -i / -s arrive as the strings '0'/'1' (or the int default 1).
        case_insensitive = int(opts.insensitive)
        search_desc = int(opts.desc)
        lookup(opts.db, opts.key, FORMATS[opts.format],
               insensitive=case_insensitive, desc=search_desc)