spacemacs/layers/+lang/python/local/pylookup/pylookup.py

330 lines
10 KiB
Python
Raw Normal View History

#!/usr/bin/env python2
"""
Pylookup is to lookup entries from python documentation, especially within
emacs. Pylookup adopts most of ideas from haddoc, lovely toolkit by Martin
Blais.
(usage)
./pylookup.py -l ljust
./pylookup.py -u http://docs.python.org
"""
from __future__ import with_statement
import os
import sys
import re
try:
import cPickle as pickle
except:
import pickle
import formatter
from os.path import join, dirname, exists, abspath, expanduser
from contextlib import closing
if sys.version_info[0] == 3:
import html.parser as htmllib
import urllib.parse as urlparse
import urllib.request as urllib
else:
import htmllib, urllib, urlparse
VERBOSE = False
FORMATS = {
"Emacs" : "{entry}\t({desc})\t[{book}];{url}",
"Terminal" : "{entry}\t({desc})\t[{book}]\n{url}"
}
def build_book(s, num):
"""
Build book identifier from `s`, with `num` links.
"""
for matcher, replacement in (("library", "lib"),
("c-api", "api"),
("reference", "ref"),
("", "etc")):
if matcher in s:
return replacement if num == 1 else "%s/%d" % (replacement, num)
def trim(s):
"""
Add any globle filtering rules here
"""
s = s.replace( "Python Enhancement Proposals!", "")
s = s.replace( "PEP ", "PEP-")
return s
class Element(object):
def __init__(self, entry, desc, book, url):
self.book = book
self.url = url
self.desc = desc
self.entry = entry
def __format__(self, format_spec):
return format_spec.format(entry=self.entry, desc=self.desc,
book=self.book, url=self.url)
def match_insensitive(self, key):
"""
Match key case insensitive against entry and desc.
`key` : Lowercase string.
"""
return key in self.entry.lower() or key in self.desc.lower()
def match_sensitive(self, key):
"""
Match key case sensitive against entry and desc.
`key` : Lowercase string.
"""
return key in self.entry or key in self.desc
def match_in_entry_insensitive(self, key):
"""
Match key case insensitive against entry.
`key` : Lowercase string.
"""
return key in self.entry.lower()
def match_in_entry_sensitive(self, key):
"""
Match key case sensitive against entry.
`key` : Lowercase string.
"""
return key in self.entry
def get_matcher(insensitive=True, desc=True):
"""
Get `Element.match_*` function.
>>> get_matcher(0, 0)
<unbound method Element.match_in_entry_sensitive>
>>> get_matcher(1, 0)
<unbound method Element.match_in_entry_insensitive>
>>> get_matcher(0, 1)
<unbound method Element.match_sensitive>
>>> get_matcher(1, 1)
<unbound method Element.match_insensitive>
"""
_sensitive = "_insensitive" if insensitive else "_sensitive"
_in_entry = "" if desc else "_in_entry"
return getattr(Element, "match{0}{1}".format(_in_entry, _sensitive))
class IndexProcessor( htmllib.HTMLParser ):
"""
Extract the index links from a Python HTML documentation index.
"""
def __init__( self, writer, dirn):
htmllib.HTMLParser.__init__( self, formatter.NullFormatter() )
self.writer = writer
self.dirn = dirn
self.entry = ""
self.desc = ""
self.list_entry = False
self.do_entry = False
self.one_entry = False
self.num_of_a = 0
self.desc_cnt = 0
def start_dd( self, att ):
self.list_entry = True
def end_dd( self ):
self.list_entry = False
def start_dt( self, att ):
self.one_entry = True
self.num_of_a = 0
def end_dt( self ):
self.do_entry = False
def start_a( self, att ):
if self.one_entry:
self.url = join( self.dirn, dict( att )[ 'href' ] )
self.save_bgn()
def end_a( self ):
global VERBOSE
if self.one_entry:
if self.num_of_a == 0 :
self.desc = self.save_end()
if VERBOSE:
self.desc_cnt += 1
if self.desc_cnt % 100 == 0:
sys.stdout.write("%04d %s\r" \
% (self.desc_cnt, self.desc.ljust(80)))
# extract fist element
# ex) __and__() (in module operator)
if not self.list_entry :
self.entry = re.sub( "\([^)]+\)", "", self.desc )
# clean up PEP
self.entry = trim(self.entry)
match = re.search( "\([^)]+\)", self.desc )
if match :
self.desc = match.group(0)
self.desc = trim(re.sub( "[()]", "", self.desc ))
self.num_of_a += 1
book = build_book(self.url, self.num_of_a)
e = Element(self.entry, self.desc, book, self.url)
self.writer(e)
def update(db, urls, append=False):
"""Update database with entries from urls.
`db` : filename to database
`urls` : list of URL
`append` : append to db
"""
mode = "ab" if append else "wb"
with open(db, mode) as f:
writer = lambda e: pickle.dump(e, f)
for url in urls:
# detech 'file' or 'url' schemes
parsed = urlparse.urlparse(url)
if not parsed.scheme or parsed.scheme == "file":
dst = abspath(expanduser(parsed.path))
if not os.path.exists(dst):
print("Error: %s doesn't exist" % dst)
exit(1)
url = "file://%s" % dst
else:
url = parsed.geturl()
potential_urls = []
if url.endswith('.html'):
potential_urls.append(url)
else:
# guess index URLs
# for stdlib, this is genindex-all.html
# for django, numpy, etc. it's genindex.html
url = url.rstrip("/")
potential_urls.append(url + "/genindex-all.html")
potential_urls.append(url + "/genindex.html")
success = False
for index_url in potential_urls:
try:
print "Wait for a few seconds..."
print "Fetching index from '%s'" % index_url
index = urllib.urlopen(index_url).read()
if not issubclass(type(index), str):
index = index.decode()
parser = IndexProcessor(writer, dirname(index_url))
with closing(parser):
parser.feed(index)
# success, we don't need to try other potential urls
print "Loaded index from '%s'" % index_url
success = True
break
except IOError:
print "Error: fetching file from '%s'" % index_url
if not success:
print "Failed to load index for input '%s'" % url
def lookup(db, key, format_spec, out=sys.stdout, insensitive=True, desc=True):
"""Lookup key from database and print to out.
`db` : filename to database
`key` : key to lookup
`out` : file-like to write to
`insensitive` : lookup key case insensitive
"""
matcher = get_matcher(insensitive, desc)
if insensitive:
key = key.lower()
with open(db, "rb") as f:
try:
while True:
e = pickle.load(f)
if matcher(e, key):
out.write('%s\n' % format(e, format_spec))
except EOFError:
pass
def cache(db, out=sys.stdout):
"""Print unique entries from db to out.
`db` : filename to database
`out` : file-like to write to
"""
with open(db, "rb") as f:
keys = set()
try:
while True:
e = pickle.load(f)
k = e.entry
k = re.sub( "\([^)]*\)", "", k )
k = re.sub( "\[[^]]*\]", "", k )
keys.add(k)
except EOFError:
pass
for k in keys:
out.write('%s\n' % k)
if __name__ == "__main__":
import optparse
parser = optparse.OptionParser( __doc__.strip() )
parser.add_option( "-d", "--db",
help="database name",
dest="db", default="pylookup.db" )
parser.add_option( "-l", "--lookup",
help="keyword to search",
dest="key" )
parser.add_option( "-u", "--update",
help="update url or path",
action="append", type="str", dest="url" )
parser.add_option( "-c", "--cache" ,
help="extract keywords, internally used",
action="store_true", default=False, dest="cache")
parser.add_option( "-a", "--append",
help="append to the db from multiple sources",
action="store_true", default=False, dest="append")
parser.add_option( "-f", "--format",
help="type of output formatting, valid: Emacs, Terminal",
choices=["Emacs", "Terminal"],
default="Terminal", dest="format")
parser.add_option( "-i", "--insensitive", default=1, choices=['0', '1'],
help="SEARCH OPTION: insensitive search "
"(valid: 0, 1; default: %default)")
parser.add_option( "-s", "--desc", default=1, choices=['0', '1'],
help="SEARCH OPTION: include description field "
"(valid: 0, 1; default: %default)")
parser.add_option("-v", "--verbose",
help="verbose", action="store_true",
dest="verbose", default=False)
( opts, args ) = parser.parse_args()
VERBOSE = opts.verbose
if opts.url:
update(opts.db, opts.url, opts.append)
if opts.cache:
cache(opts.db)
if opts.key:
lookup(opts.db, opts.key, FORMATS[opts.format],
insensitive=int(opts.insensitive), desc=int(opts.desc))