From 959fd01644601e04595cfb3690fc3dfc9d2e87aa Mon Sep 17 00:00:00 2001 From: George Date: Sat, 17 Jan 2015 05:10:21 +0100 Subject: [PATCH 1/3] Replaced PyQuery with BeautifulSoup4 and lxml. Parses UTF properly now. Adjusted README, requirements. Code contains TODO:s --- README.rst | 20 ++++++- buster/_version.py | 2 +- buster/buster.py | 145 ++++++++++++++++++++++++++++++++++----------- requirements.txt | 4 +- 4 files changed, 132 insertions(+), 39 deletions(-) diff --git a/README.rst b/README.rst index 05026ea..12b1afa 100644 --- a/README.rst +++ b/README.rst @@ -17,7 +17,7 @@ The interface       Creates a GIT repository inside ``static/`` directory. -``generate [--domain=]`` +``generate [--domain=] [--dir=] [--target=] [--replace-all=]``       Generates static pages from locally running Ghost instance. @@ -38,6 +38,15 @@ Buster assumes you have ``static/`` folder in your current directory (or creates one during ``setup`` command). You can specify custom directory path using ``[--dir=]`` option to any of the above commands. +The ``[--replace-all=]`` option switches between replacing +all urls buster can find, or only ``href`` attributes in ``a`` tags. + +The ``[--target=]`` option let's you choose the target +domain and root directory of the generated site. This is especially +needed for the RSS/Atom feed that would otherwise point to ``--domain``. +The option provides an alternative to changing your blog URL in Ghost's +config.js (see below). + Don't forget to change your blog URL in config.js in Ghost. @@ -68,6 +77,15 @@ installed via ``pip``: command line interfaces *easily*. - `GitPython `__: Python interface for GIT. + `BeautifulSoup4 `__: + Painlessly parses and creates (x)HTML(5) + `lxml `__: XML/HTML processor. + +Example +_______ + +Generate a static version of your ghost blog with all links replaces via +``buster generate --domain=http://localhost:2368 --target=https://foo.com --replace-all=yes`` Ghost. What? 
------------ diff --git a/buster/_version.py b/buster/_version.py index 64988fe..07954f5 100644 --- a/buster/_version.py +++ b/buster/_version.py @@ -1,2 +1,2 @@ -__version_info__ = (0, 1, 3) +__version_info__ = (0, 1, 3, "bs4") __version__ = '.'.join(map(str, __version_info__)) diff --git a/buster/buster.py b/buster/buster.py index 4e6e532..82e477a 100644 --- a/buster/buster.py +++ b/buster/buster.py @@ -2,7 +2,7 @@ Usage: buster.py setup [--gh-repo=] [--dir=] - buster.py generate [--domain=] [--dir=] + buster.py generate [--domain=] [--dir=] [--target=] [--replace-all=] buster.py preview [--dir=] buster.py deploy [--dir=] buster.py add-domain [--dir=] @@ -12,8 +12,10 @@ Options: -h --help Show this screen. --version Show version. - --dir= Absolute path of directory to store static pages. + --dir= Absolute path of local directory to store static pages. --domain= Address of local ghost installation [default: localhost:2368]. + --target= Address of target root URL (e.g. https://domain.com/path/to/root) [default: --domain] + --replace-all= Whether to only replace URLs found in a tags, or all occurences of --domain [default: no] --gh-repo= URL of your gh-pages repository. 
""" @@ -27,16 +29,37 @@ from docopt import docopt from time import gmtime, strftime from git import Repo -from pyquery import PyQuery - +from bs4 import BeautifulSoup +from lxml import etree, html +from io import StringIO, BytesIO def main(): - arguments = docopt(__doc__, version='0.1.3') + # TODO: arguments should be handled with argparse (https://docs.python.org/2/library/argparse.html) + arguments = docopt(__doc__, version='0.1.3.bs4') if arguments['--dir'] is not None: static_path = arguments['--dir'] else: static_path = os.path.join(os.getcwd(), 'static') - + + # set default --domain to localhost:2368, as in the description above + if arguments['--domain'] is not None: + local_domain = arguments['--domain'] + else: + local_domain = "http://localhost:2368" + + # make sure that --target is set as well + # (this is needed for RSS, since otherwise urls in the feed resolve to the local_domain) + if arguments['--target'] is not None: + target_root = arguments['--target'] + else: + target_root = local_domain + + # set scope for url replacement (i.e. 
only tags or everything) + if arguments['--replace-all'] == "yes": + replace_all = True + else: + replace_all = False + if arguments['generate']: command = ("wget " "--recursive " # follow links to download entire site @@ -46,9 +69,13 @@ def main(): "--directory-prefix {1} " # download contents to static/ folder "--no-host-directories " # don't create domain named folder "--restrict-file-name=unix " # don't escape query string - "{0}").format(arguments['--domain'], static_path) + "{0}").format(local_domain, static_path) os.system(command) + + # init list of renamed files + files = list() + # remove query string since Ghost 0.4 file_regex = re.compile(r'.*?(\?.*)') for root, dirs, filenames in os.walk(static_path): @@ -57,39 +84,87 @@ def main(): newname = re.sub(r'\?.*', '', filename) print "Rename", filename, "=>", newname os.rename(os.path.join(root, filename), os.path.join(root, newname)) - + files.append(newname) # add new name to file-list + # remove superfluous "index.html" from relative hyperlinks found in text abs_url_regex = re.compile(r'^(?:[a-z]+:)?//', flags=re.IGNORECASE) - def fixLinks(text, parser): - d = PyQuery(bytes(bytearray(text, encoding='utf-8')), parser=parser) - for element in d('a'): - e = PyQuery(element) - href = e.attr('href') - if not abs_url_regex.search(href): - new_href = re.sub(r'rss/index\.html$', 'rss/index.rss', href) - new_href = re.sub(r'/index\.html$', '/', new_href) - e.attr('href', new_href) - print "\t", href, "=>", new_href - if parser == 'html': - return d.html(method='html').encode('utf8') - return d.__unicode__().encode('utf8') + + # form regex from file-list (i.e. 
all files, with stripped arguments from above) + url_suffix_regex = re.compile(r'(' + "|".join(files) + r')(\?.*?(?=\"))', flags = re.IGNORECASE) + + def repl(m): # select regex matching group + print "---Removing", m.group(2), "from", m.group(1) + return m.group(1) + + def fixAllUrls(data, parser, encoding): + + # step 1: + # load HTML/XML in lxml + if parser == "xml": # parser for XML that keeps CDATA elements (beautifulsoup doesn't) + parser = etree.XMLParser(encoding=encoding, strip_cdata=False, resolve_entities=True) + data = etree.XML(data, parser) + # format the parsed xml for output (keep all non-ascii chars inside CDATA) + data = etree.tostring(data, pretty_print=True, encoding=encoding) + if parser == "lxml": # parser for HTML ("lenient" setting for html5) + + # the following should work, but spits out utf-8 numeric character references... + + # parser = etree.HTMLParser(encoding=encoding, strip_cdata=False) + # if isinstance(data, str): + # data = unicode(data, encoding) + # data = etree.parse(StringIO(data), parser) + # data = html.tostring(data.getroot(), pretty_print=True, method="html") + # data = u'\n' + data + + # BeautifulSoup outputs html entities with formatter="html". lxml above should be faster + data = BeautifulSoup(data, "html5lib").prettify(encoding,formatter="minimal") + + # step 2: + # substitute all occurences of --domain (local_domain) argument with --target (target_root) + data = re.sub(local_domain, target_root, data) + + # step 3: + # remove URL arguments (e.g. query string) from renamed files + # TODO: make it work with googlefonts + data = url_suffix_regex.sub(repl, data) + return data + + def fixUrls(data, parser, encoding): + # Is this is a HTML document AND are we looking only for tags? 
+ if parser == 'lxml' and not replace_all: + soup = BeautifulSoup(data, parser) # TODO: replace beautifulsoup with lxml (still beats pyQuery, though) + # adjust all href attributes of html-link elements + for a in soup.findAll('a'): # for each element + if not abs_url_regex.search(a['href']): # that is not an absolute URL + new_href = re.sub(r'rss/index\.html$', 'rss/index.rss', a['href']) # adjust href case 1 + new_href = re.sub(r'/index\.html$', '/', new_href) # adjust href case 2, + print "\t", a['href'], "=>", new_href # tell about it, + a['href'] = a['href'].replace(a['href'], new_href) # perform replacement, and + return soup.prettify(encoding,formatter="html") # return pretty utf-8 html with encoded html entities + + # Otherwise, fall through to fixAllUrls() for all other cases + # (XML needs to always go through here AND we want to replace all URLs) + return fixAllUrls(data, parser, encoding) # fix links in all html files for root, dirs, filenames in os.walk(static_path): for filename in fnmatch.filter(filenames, "*.html"): - filepath = os.path.join(root, filename) - parser = 'html' - if root.endswith("/rss"): # rename rss index.html to index.rss - parser = 'xml' - newfilepath = os.path.join(root, os.path.splitext(filename)[0] + ".rss") - os.rename(filepath, newfilepath) - filepath = newfilepath - with open(filepath) as f: - filetext = f.read().decode('utf8') - print "fixing links in ", filepath - newtext = fixLinks(filetext, parser) - with open(filepath, 'w') as f: - f.write(newtext) + filepath = os.path.join(root, filename) + parser = 'lxml' # beautifulsoup parser selection (i.e. 
lxml) + if root.endswith("/rss"): # rename rss index.html to index.rss, TODO: implement support for sitemap + parser = 'xml' # select xml parser for this file + newfilepath = os.path.join(root, os.path.splitext(filename)[0] + ".rss") + os.rename(filepath, newfilepath) + filepath = newfilepath + with open(filepath) as f: + filetext = f.read() # beautifulsoup: convert anything to utf-8 via unicode,dammit + print "Fixing links in ", filepath + # define output encoding, in case you want something else + # (not that this matters, since we escape non-ascii chars in html as html entities) + encoding = "utf-8" + newtext = fixUrls(filetext, parser, encoding) + with open(filepath, 'w') as f: + f.write(newtext) elif arguments['preview']: os.chdir(static_path) @@ -163,4 +238,4 @@ def fixLinks(text, parser): print __doc__ if __name__ == '__main__': - main() + main() \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 68b93d1..11e5a76 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,6 +2,6 @@ GitPython==0.3.2.RC1 async==0.6.1 docopt==0.6.1 gitdb==0.5.4 -pyquery==1.2.8 smmap==0.8.2 -pyquery==1.2.8 +beautifulsoup4>=4.3.2 +lxml>=3.4.1 \ No newline at end of file From b6cd08baa1de174e540767a19fc23c0542911ee0 Mon Sep 17 00:00:00 2001 From: geotti Date: Tue, 20 Jan 2015 23:49:07 +0100 Subject: [PATCH 2/3] Replace docopt with argparse more control Signed-off-by: geotti --- .gitignore | 2 + buster/_version.py | 2 +- buster/buster.py | 178 +++++++++++++++++++++++++-------------------- requirements.txt | 3 +- usage.txt | 20 +++++ 5 files changed, 126 insertions(+), 79 deletions(-) create mode 100644 usage.txt diff --git a/.gitignore b/.gitignore index ded6067..c09c9fb 100644 --- a/.gitignore +++ b/.gitignore @@ -34,3 +34,5 @@ nosetests.xml .mr.developer.cfg .project .pydevproject + +.DS_Store diff --git a/buster/_version.py b/buster/_version.py index 07954f5..0e05674 100644 --- a/buster/_version.py +++ b/buster/_version.py @@ -1,2 +1,2 @@ 
-__version_info__ = (0, 1, 3, "bs4") +__version_info__ = (0, 1, 3, "bs4",2) __version__ = '.'.join(map(str, __version_info__)) diff --git a/buster/buster.py b/buster/buster.py index 82e477a..6b5da34 100644 --- a/buster/buster.py +++ b/buster/buster.py @@ -1,22 +1,4 @@ """Ghost Buster. Static site generator for Ghost. - -Usage: - buster.py setup [--gh-repo=] [--dir=] - buster.py generate [--domain=] [--dir=] [--target=] [--replace-all=] - buster.py preview [--dir=] - buster.py deploy [--dir=] - buster.py add-domain [--dir=] - buster.py (-h | --help) - buster.py --version - -Options: - -h --help Show this screen. - --version Show version. - --dir= Absolute path of local directory to store static pages. - --domain= Address of local ghost installation [default: localhost:2368]. - --target= Address of target root URL (e.g. https://domain.com/path/to/root) [default: --domain] - --replace-all= Whether to only replace URLs found in a tags, or all occurences of --domain [default: no] - --gh-repo= URL of your gh-pages repository. 
""" import os @@ -26,41 +8,86 @@ import shutil import SocketServer import SimpleHTTPServer -from docopt import docopt from time import gmtime, strftime from git import Repo +from io import StringIO, BytesIO from bs4 import BeautifulSoup from lxml import etree, html -from io import StringIO, BytesIO +import argparse def main(): - # TODO: arguments should be handled with argparse (https://docs.python.org/2/library/argparse.html) - arguments = docopt(__doc__, version='0.1.3.bs4') - if arguments['--dir'] is not None: - static_path = arguments['--dir'] - else: - static_path = os.path.join(os.getcwd(), 'static') - - # set default --domain to localhost:2368, as in the description above - if arguments['--domain'] is not None: - local_domain = arguments['--domain'] - else: - local_domain = "http://localhost:2368" - - # make sure that --target is set as well - # (this is needed for RSS, since otherwise urls in the feed resolve to the local_domain) - if arguments['--target'] is not None: - target_root = arguments['--target'] - else: - target_root = local_domain - # set scope for url replacement (i.e. only tags or everything) - if arguments['--replace-all'] == "yes": - replace_all = True - else: - replace_all = False +# Declare argparse options + parser = argparse.ArgumentParser(description='Ghost Buster. 
Static site generator for Ghost.', + version='0.1.3.bs4.2', + prog='buster', + add_help=True, + epilog='Powered by ectoplasm.', + formatter_class=argparse.RawDescriptionHelpFormatter, + usage=''' + buster setup [-h] [-p output/dir] repository + buster generate [-h] [--path output/dir] (--replace-all | --replace-tags) [source-url] [target-url] + buster preview [-h] [--path [output/dir]] + buster deploy [-h] [--path [output/dir]] + buster add-domain [-h] [--path [output/dir]] target-domain + buster -h, --help + buster -v, --version + ''') + parser._optionals.title = "options" + +# Init subparsers + subparsers = parser.add_subparsers(dest='current_action', title='actions', description='''Choose an action\n(type "%(prog)s action -h" for additional instructions)''') + +# Setup command + setup_parser = subparsers.add_parser('setup', help='Setup Github repository') + setup_parser._positionals.title = "required" + setup_parser._optionals.title = "options" + setup_parser.add_argument('repository', action='store', metavar='repository', help='URL of your gh-pages repository.') + setup_parser.add_argument('-p', '--path', action='store', dest='static_path', default='static', metavar='output/dir', help='Output path of local directory to store static pages. (default: static)') + + +# Generate command + generate_parser = subparsers.add_parser('generate', help='Bust the Ghost') + generate_parser._positionals.title = "required" + generate_parser.add_argument('source', action='store', default='http://localhost:2368', metavar='source-url', nargs='?', help='Address of local Ghost installation (default: http:/localhost:2368)') + generate_parser.add_argument('-p', '--path', action='store', dest='static_path', default='static', metavar='output/dir', help='Output path of local directory to store static pages. (default: static)') + generate_parser.add_argument('target', action='store', metavar='target-url', default='http://localhost:2368', nargs='?', help='Address of target root URL (e.g. 
https://domain.com/path/to/root)') + # replacement switch + #TODO: Add more choices to define what to replace with --replace-all switch + group = generate_parser.add_mutually_exclusive_group(required=True) + group.add_argument('--replace-all', '-a', dest='replace', action='store_true', help='Replace all occurences of source-url') + group.add_argument('--replace-tags', '-t', dest='replace', action='store_false', help='Replace only URLs found in tags') + +# Preview command + preview_parser = subparsers.add_parser('preview', help='Local preview', formatter_class=argparse.ArgumentDefaultsHelpFormatter) + preview_parser._optionals.title = "options" + preview_parser.add_argument('-p', '--path', action='store', dest='static_path', default='static', metavar='output/dir', nargs="?", help='Output path of local directory to store static pages. (default: static)') - if arguments['generate']: +# Deploy command + deploy_parser = subparsers.add_parser('deploy', help='Deploy to Github pages', formatter_class=argparse.ArgumentDefaultsHelpFormatter) + deploy_parser._optionals.title = "options" + deploy_parser.add_argument('-p', '--path', action='store', dest='static_path', default='static', metavar='output/dir', nargs='?', help='Output path of local directory to store static pages. (default: static)') + +# Add-Domain command + add_parser = subparsers.add_parser('add-domain', help='Add CNAME to repository', formatter_class=argparse.ArgumentDefaultsHelpFormatter) + add_parser._positionals.title = "required" + add_parser._optionals.title = "options" + add_parser.add_argument('target', action="store", metavar='target-domain', help='Address of target URL') + add_parser.add_argument('-p', '--path', action='store', dest='static_path', metavar='output/dir', nargs='?', help='Output path of local directory to store static pages. 
(default: static)') + + # print help, when run without arguments + if len(sys.argv)==1: + parser.print_help() + sys.exit(1) + args=parser.parse_args() + + print "Running: buster " + args.current_action + + # simplify comparison + action = args.current_action + + + if action == 'generate': command = ("wget " "--recursive " # follow links to download entire site "--convert-links " # make links relative @@ -69,16 +96,15 @@ def main(): "--directory-prefix {1} " # download contents to static/ folder "--no-host-directories " # don't create domain named folder "--restrict-file-name=unix " # don't escape query string - "{0}").format(local_domain, static_path) + "{0}").format(args.source, args.static_path) os.system(command) - # init list of renamed files files = list() # remove query string since Ghost 0.4 file_regex = re.compile(r'.*?(\?.*)') - for root, dirs, filenames in os.walk(static_path): + for root, dirs, filenames in os.walk(args.static_path): for filename in filenames: if file_regex.match(filename): newname = re.sub(r'\?.*', '', filename) @@ -120,8 +146,8 @@ def fixAllUrls(data, parser, encoding): data = BeautifulSoup(data, "html5lib").prettify(encoding,formatter="minimal") # step 2: - # substitute all occurences of --domain (local_domain) argument with --target (target_root) - data = re.sub(local_domain, target_root, data) + # substitute all occurences of --source-url (args.source) argument with --target-url (args.target) + data = re.sub(args.source, args.target, data) # step 3: # remove URL arguments (e.g. query string) from renamed files @@ -131,14 +157,14 @@ def fixAllUrls(data, parser, encoding): def fixUrls(data, parser, encoding): # Is this is a HTML document AND are we looking only for tags? 
- if parser == 'lxml' and not replace_all: + if parser == 'lxml' and not args.replace: soup = BeautifulSoup(data, parser) # TODO: replace beautifulsoup with lxml (still beats pyQuery, though) # adjust all href attributes of html-link elements for a in soup.findAll('a'): # for each element if not abs_url_regex.search(a['href']): # that is not an absolute URL - new_href = re.sub(r'rss/index\.html$', 'rss/index.rss', a['href']) # adjust href case 1 - new_href = re.sub(r'/index\.html$', '/', new_href) # adjust href case 2, - print "\t", a['href'], "=>", new_href # tell about it, + new_href = re.sub(r'rss/index\.html$', 'rss/index.rss', a['href']) # adjust href 1 (rss feed) + new_href = re.sub(r'/index\.html$', '/', new_href) # adjust href 2 (dir index), + print "\t", a['href'], "=>", new_href # brag about it, a['href'] = a['href'].replace(a['href'], new_href) # perform replacement, and return soup.prettify(encoding,formatter="html") # return pretty utf-8 html with encoded html entities @@ -147,11 +173,11 @@ def fixUrls(data, parser, encoding): return fixAllUrls(data, parser, encoding) # fix links in all html files - for root, dirs, filenames in os.walk(static_path): + for root, dirs, filenames in os.walk(args.static_path): for filename in fnmatch.filter(filenames, "*.html"): filepath = os.path.join(root, filename) parser = 'lxml' # beautifulsoup parser selection (i.e. 
lxml) - if root.endswith("/rss"): # rename rss index.html to index.rss, TODO: implement support for sitemap + if root.endswith("/rss"): # rename index.html in .../rss to index.rss, TODO: implement support for sitemap parser = 'xml' # select xml parser for this file newfilepath = os.path.join(root, os.path.splitext(filename)[0] + ".rss") os.rename(filepath, newfilepath) @@ -166,8 +192,8 @@ def fixUrls(data, parser, encoding): with open(filepath, 'w') as f: f.write(newtext) - elif arguments['preview']: - os.chdir(static_path) + elif action == 'preview': + os.chdir(args.static_path) Handler = SimpleHTTPServer.SimpleHTTPRequestHandler httpd = SocketServer.TCPServer(("", 9000), Handler) @@ -176,19 +202,17 @@ def fixUrls(data, parser, encoding): # gracefully handle interrupt here httpd.serve_forever() - elif arguments['setup']: - if arguments['--gh-repo']: - repo_url = arguments['--gh-repo'] - else: - repo_url = raw_input("Enter the Github repository URL:\n").strip() + elif action == 'setup': + + repo_url = args.repository # Create a fresh new static files directory - if os.path.isdir(static_path): - confirm = raw_input("This will destroy everything inside static/." - " Are you sure you want to continue? (y/N)").strip() + if os.path.isdir(args.static_path): + confirm = raw_input("This will destroy everything inside " + args.static_path + + " Are you sure you wish to continue? 
(y/N)").strip() if confirm != 'y' and confirm != 'Y': sys.exit(0) - shutil.rmtree(static_path) + shutil.rmtree(args.static_path) # User/Organization page -> master branch # Project page -> gh-pages branch @@ -198,7 +222,7 @@ def fixUrls(data, parser, encoding): branch = 'master' # Prepare git repository - repo = Repo.init(static_path) + repo = Repo.init(args.static_path) git = repo.git if branch == 'gh-pages': @@ -206,14 +230,14 @@ def fixUrls(data, parser, encoding): repo.create_remote('origin', repo_url) # Add README - file_path = os.path.join(static_path, 'README.md') + file_path = os.path.join(args.static_path, 'README.md') with open(file_path, 'w') as f: f.write('# Blog\nPowered by [Ghost](http://ghost.org) and [Buster](https://github.com/axitkhurana/buster/).\n') print "All set! You can generate and deploy now." - elif arguments['deploy']: - repo = Repo(static_path) + elif action == 'deploy': + repo = Repo(args.static_path) repo.git.add('.') current_time = strftime("%Y-%m-%d %H:%M:%S", gmtime()) @@ -224,18 +248,18 @@ def fixUrls(data, parser, encoding): repo.active_branch.name]) print "Good job! Deployed to Github Pages." - elif arguments['add-domain']: - repo = Repo(static_path) - custom_domain = arguments[''] + elif action == 'add-domain': + repo = Repo(args.static_path) + custom_domain = args.target - file_path = os.path.join(static_path, 'CNAME') + file_path = os.path.join(args.static_path, 'CNAME') with open(file_path, 'w') as f: f.write(custom_domain + '\n') print "Added CNAME file to repo. 
Use `deploy` to deploy" - else: - print __doc__ + else: # probably unnecessary + parser.print_help() if __name__ == '__main__': main() \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 11e5a76..093f574 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,4 +4,5 @@ docopt==0.6.1 gitdb==0.5.4 smmap==0.8.2 beautifulsoup4>=4.3.2 -lxml>=3.4.1 \ No newline at end of file +lxml>=3.4.1 +argparse>=1.3.0 \ No newline at end of file diff --git a/usage.txt b/usage.txt new file mode 100644 index 0000000..e0e677c --- /dev/null +++ b/usage.txt @@ -0,0 +1,20 @@ +"""Ghost Buster. Static site generator for Ghost. +Usage: + buster setup [-h] [-p output/dir] repository + buster generate [-h] [--path output/dir] (--replace-all | --replace-tags) [source-url] [target-url] + buster preview [-h] [--path [output/dir]] + buster deploy [-h] [--path [output/dir]] + buster add-domain [-h] [--path [output/dir]] target-domain + buster -h, --help + buster -v, --version + +Options: + -h, --help Show this screen. + -v, --version Show version. + -p, --path output/dir Output path of local directory to store static pages. [default: static] + source-url Address of local Ghost installation [default: http://localhost:2368]. + target-url Address of target root URL (e.g. https://domain.com/path/to/root) [default: http://localhost:2368] + (-a | -t), (--replace-all + |--replace-tags) Replace all occurrences of source-url, or only in tags + repository URL of your gh-pages repository. 
+""" \ No newline at end of file From 2ffbd66df90ae5af9166a8b787c65329369f0069 Mon Sep 17 00:00:00 2001 From: geotti Date: Wed, 21 Jan 2015 00:46:07 +0100 Subject: [PATCH 3/3] Fixed RSS Link Signed-off-by: geotti --- README.rst | 34 ++++++++++++++++++++---------- buster/_version.py | 2 +- buster/buster.py | 52 ++++++++++++++++++++++++---------------------- requirements.txt | 1 - setup.py | 2 +- usage.txt | 2 +- 6 files changed, 53 insertions(+), 40 deletions(-) diff --git a/README.rst b/README.rst index 12b1afa..72ff0c5 100644 --- a/README.rst +++ b/README.rst @@ -10,30 +10,40 @@ Start with a clean, no commits Github repository. Warning! This project is a hack. It's not official. But it works for me. -The interface -------------- +The interface commands +---------------------- -``setup [--gh-repo=]`` +``setup [--path output/dir] repository`` -      Creates a GIT repository inside ``static/`` directory. +      Creates a GIT repository inside ``--path`` directory. -``generate [--domain=] [--dir=] [--target=] [--replace-all=]`` +``generate [--path output/dir] (--replace-all | --replace-tags) [source-url] [target-url]``       Generates static pages from locally running Ghost instance. +``--replace-all`` substitutes all ``source-url`` instances with value of ``target-url`` -``preview`` +``preview [--path [output/dir]]``       Preview what's generated on ``localhost:9000``. -``deploy`` +``deploy [--path [output/dir]]``       Commits and deploys changes static files to Github repository. -``add-domain `` +``add-domain [--path [output/dir]] target-domain``       Adds CNAME file with custom domain name as required by Github Pages. +``buster command -h`` +      Outputs additional usage information for a command + +``buster -h`` +      Outputs top-level help + +``buster -v`` +      Prints the current buster version. + Buster assumes you have ``static/`` folder in your current directory (or creates one during ``setup`` command). 
You can specify custom directory path using ``[--dir=]`` option to any of the above commands. @@ -73,8 +83,8 @@ Requirements The following python packages would be installed automatically when installed via ``pip``: -- `docopt `__: Creates beautiful - command line interfaces *easily*. +- `argparse `__: Creates + powerful, functional command line interfaces. - `GitPython `__: Python interface for GIT. `BeautifulSoup4 `__: @@ -85,7 +95,7 @@ Example _______ Generate a static version of your ghost blog with all links replaces via -``buster generate --domain=http://localhost:2368 --target=https://foo.com --replace-all=yes`` +``buster generate http://localhost:2368 https://foo.com --path /output/dir --replace-all`` Ghost. What? ------------ @@ -120,3 +130,5 @@ new one. Pull requests welcome! *Made with* `jugaad `__ *in* `Dilli `__. + +*Powered by ectoplasm.* diff --git a/buster/_version.py b/buster/_version.py index 0e05674..cfb8032 100644 --- a/buster/_version.py +++ b/buster/_version.py @@ -1,2 +1,2 @@ -__version_info__ = (0, 1, 3, "bs4",2) +__version_info__ = (0, 1, 3, "bs4",3) __version__ = '.'.join(map(str, __version_info__)) diff --git a/buster/buster.py b/buster/buster.py index 6b5da34..b09891e 100644 --- a/buster/buster.py +++ b/buster/buster.py @@ -19,20 +19,11 @@ def main(): # Declare argparse options parser = argparse.ArgumentParser(description='Ghost Buster. 
Static site generator for Ghost.', - version='0.1.3.bs4.2', + version='0.1.3.bs4.3', prog='buster', add_help=True, epilog='Powered by ectoplasm.', - formatter_class=argparse.RawDescriptionHelpFormatter, - usage=''' - buster setup [-h] [-p output/dir] repository - buster generate [-h] [--path output/dir] (--replace-all | --replace-tags) [source-url] [target-url] - buster preview [-h] [--path [output/dir]] - buster deploy [-h] [--path [output/dir]] - buster add-domain [-h] [--path [output/dir]] target-domain - buster -h, --help - buster -v, --version - ''') + formatter_class=argparse.RawDescriptionHelpFormatter) parser._optionals.title = "options" # Init subparsers @@ -127,22 +118,29 @@ def fixAllUrls(data, parser, encoding): # step 1: # load HTML/XML in lxml if parser == "xml": # parser for XML that keeps CDATA elements (beautifulsoup doesn't) + print "Fixing XML" parser = etree.XMLParser(encoding=encoding, strip_cdata=False, resolve_entities=True) data = etree.XML(data, parser) # format the parsed xml for output (keep all non-ascii chars inside CDATA) data = etree.tostring(data, pretty_print=True, encoding=encoding) if parser == "lxml": # parser for HTML ("lenient" setting for html5) + print "Fixing HTML" # the following should work, but spits out utf-8 numeric character references... - + # TODO: FIXME # parser = etree.HTMLParser(encoding=encoding, strip_cdata=False) # if isinstance(data, str): # data = unicode(data, encoding) # data = etree.parse(StringIO(data), parser) # data = html.tostring(data.getroot(), pretty_print=True, method="html") - # data = u'\n' + data + # data = u'\n' + unicode(data) + + # go through fixTagsOnly (we'll be calling bs4 twice until above is fixed...) + data = fixTagsOnly(data, parser, encoding) - # BeautifulSoup outputs html entities with formatter="html". lxml above should be faster + # BeautifulSoup outputs html entities with formatter="html" (if you need them). 
+ # lxml above should be faster, but outputs utf-8 numeric char refs + print "Fixing remaining links" data = BeautifulSoup(data, "html5lib").prettify(encoding,formatter="minimal") # step 2: @@ -155,20 +153,24 @@ def fixAllUrls(data, parser, encoding): data = url_suffix_regex.sub(repl, data) return data + def fixTagsOnly(data, parser, encoding): + print "Fixing tags" + soup = BeautifulSoup(data, parser) # TODO: replace beautifulsoup with lxml (still beats pyQuery, though) + # adjust all href attributes of html-link elements + for a in soup.findAll('a'): # for each element + if not abs_url_regex.search(a['href']): # that is not an absolute URL + new_href = re.sub(r'rss/index\.html$', 'rss/index.rss', a['href']) # adjust href 1 (rss feed) + new_href = re.sub(r'/index\.html$', '/', new_href) # adjust href 2 (dir index), + print "\t", a['href'], "=>", new_href # brag about it, + a['href'] = a['href'].replace(a['href'], new_href) # perform replacement, and + return soup.prettify(encoding,formatter="html") # return pretty utf-8 html with encoded html entities + def fixUrls(data, parser, encoding): # Is this is a HTML document AND are we looking only for tags? 
if parser == 'lxml' and not args.replace: - soup = BeautifulSoup(data, parser) # TODO: replace beautifulsoup with lxml (still beats pyQuery, though) - # adjust all href attributes of html-link elements - for a in soup.findAll('a'): # for each element - if not abs_url_regex.search(a['href']): # that is not an absolute URL - new_href = re.sub(r'rss/index\.html$', 'rss/index.rss', a['href']) # adjust href 1 (rss feed) - new_href = re.sub(r'/index\.html$', '/', new_href) # adjust href 2 (dir index), - print "\t", a['href'], "=>", new_href # brag about it, - a['href'] = a['href'].replace(a['href'], new_href) # perform replacement, and - return soup.prettify(encoding,formatter="html") # return pretty utf-8 html with encoded html entities - - # Otherwise, fall through to fixAllUrls() for all other cases + return fixTagsOnly(data, parser, encoding) + + # Otherwise, fall through to fixAllUrls() for all other cases (i.e. currently: replace all urls) # (XML needs to always go through here AND we want to replace all URLs) return fixAllUrls(data, parser, encoding) diff --git a/requirements.txt b/requirements.txt index 093f574..49042fd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,5 @@ GitPython==0.3.2.RC1 async==0.6.1 -docopt==0.6.1 gitdb==0.5.4 smmap==0.8.2 beautifulsoup4>=4.3.2 diff --git a/setup.py b/setup.py index a126752..ca5b886 100644 --- a/setup.py +++ b/setup.py @@ -15,5 +15,5 @@ license="MIT", packages=["buster"], entry_points={"console_scripts": ["buster = buster.buster:main"]}, - install_requires=['GitPython==0.3.2.RC1', 'async==0.6.1', 'docopt==0.6.1', 'gitdb==0.5.4', 'pyquery==1.2.8', 'smmap==0.8.2'] + install_requires=['GitPython==0.3.2.RC1', 'async==0.6.1', 'gitdb==0.5.4', 'smmap==0.8.2', 'lxml>=3.4.1', 'bs4>=4.3.2', 'io', 'argparse'] ) diff --git a/usage.txt b/usage.txt index e0e677c..2abc996 100644 --- a/usage.txt +++ b/usage.txt @@ -1,6 +1,6 @@ """Ghost Buster. Static site generator for Ghost. 
Usage: - buster setup [-h] [-p output/dir] repository + buster setup [-h] [--path output/dir] repository buster generate [-h] [--path output/dir] (--replace-all | --replace-tags) [source-url] [target-url] buster preview [-h] [--path [output/dir]] buster deploy [-h] [--path [output/dir]]