#!/usr/bin/env python
from __future__ import print_function
##### Configuration ##############################
import json
# Project settings (MARKDOWN_FILES, SHORT_PROJECT_NAME, FULL_PROJECT_NAME, ...)
# are read once at import time from config.json in the current directory.
# The file is opened in a `with` block so the handle is closed right after
# parsing instead of being left for the garbage collector.
with open('config.json') as config_file:
    CONFIG = json.load(config_file)
## NOTES
## 1. This assumes that you have already created the S3 bucket whose name
## is stored in the AWS_S3_BUCKET_NAME environment variable.
## 2. Under that S3 bucket, you have created a folder whose name is stored
## in config.json as SHORT_PROJECT_NAME.
## 3. Under that folder, you have created a sub-folder called "assets",
## i.e. SHORT_PROJECT_NAME/assets.
##### Imports ####################################
import os
import datetime
import subprocess
import copy
from xmlrpclib import ServerProxy
import boto
import boto.s3.bucket
import boto.s3.key
from bs4 import BeautifulSoup
from fabric.api import task, local
##### Start with checks ##########################
# Every chapter slug must already be lower case: a slug is considered valid
# only when lower-casing it leaves it unchanged.
for chapter in CONFIG['MARKDOWN_FILES']:
    assert chapter['slug'] == chapter['slug'].lower(), (
        "Slug must be lower case : {}".format(chapter['slug']))
# S3 uploading is on only when it has not been explicitly switched off
# (AWS_ENABLED=false, case-insensitive) and all three AWS variables below are
# present and non-empty.
if str(os.environ.get('AWS_ENABLED')).lower() == 'false':
    AWS_ENABLED = False
else:
    AWS_ENABLED = all(os.environ.get(variable) for variable in (
        'AWS_ACCESS_KEY_ID',
        'AWS_SECRET_ACCESS_KEY',
        'AWS_S3_BUCKET_NAME',
    ))
    if not AWS_ENABLED:
        print("NOTE: S3 uploading is disabled because of missing "
              "AWS key environment variables.")

# In my case, they are the same - 'files.swaroopch.com'
# http://docs.amazonwebservices.com/AmazonS3/latest/dev/VirtualHosting.html#VirtualHostingCustomURLs
# S3_PUBLIC_URL is only defined when uploading is enabled; it is the host
# part of the public URLs reported after an upload.
if AWS_ENABLED:
    S3_PUBLIC_URL = os.environ['AWS_S3_BUCKET_NAME']
#else
    #S3_PUBLIC_URL = 's3.amazonaws.com/{}'.format(
        #os.environ['AWS_S3_BUCKET_NAME'])
# WordPress uploading needs every one of these environment variables to be
# present and non-empty; otherwise it is switched off with a notice.
WORDPRESS_ENABLED = all(os.environ.get(variable) for variable in (
    'WORDPRESS_RPC_URL',
    'WORDPRESS_BASE_URL',
    'WORDPRESS_BLOG_ID',
    'WORDPRESS_USERNAME',
    'WORDPRESS_PASSWORD',
    'WORDPRESS_PARENT_PAGE_ID',
    'WORDPRESS_PARENT_PAGE_SLUG',
))
if not WORDPRESS_ENABLED:
    print("NOTE: Wordpress uploading is disabled because of "
          "missing environment variables.")
##### Helper methods #############################
def _upload_to_s3(filename, key):
    """Upload a local file to S3 under ``key`` and return its public URL.

    The object is written to the bucket named by the ``AWS_S3_BUCKET_NAME``
    environment variable and its ACL is set to ``public-read``. The returned
    URL is built from the module-level ``S3_PUBLIC_URL`` and ``key``.

    Reference: http://docs.pythonboto.org/en/latest/s3_tut.html#storing-data
    """
    connection = boto.connect_s3()
    bucket = boto.s3.bucket.Bucket(connection,
                                   os.environ['AWS_S3_BUCKET_NAME'])

    s3_object = boto.s3.key.Key(bucket)
    s3_object.key = key
    s3_object.set_contents_from_filename(filename)
    s3_object.set_acl('public-read')

    public_url = 'http://{}/{}'.format(S3_PUBLIC_URL, key)
    print("Uploaded to S3 : {}".format(public_url))
    return public_url
def upload_output_to_s3(filename):
    """Upload a generated output file to ``SHORT_PROJECT_NAME/<file name>``.

    Only the part of ``filename`` after the last ``/`` is used in the S3 key.
    Returns the public URL reported by ``_upload_to_s3``.
    """
    base_name = filename.rsplit('/', 1)[-1]
    s3_key = "{}/{}".format(CONFIG['SHORT_PROJECT_NAME'], base_name)
    return _upload_to_s3(filename, s3_key)
def upload_asset_to_s3(filename):
    """Upload an asset file to ``SHORT_PROJECT_NAME/assets/<file name>``.

    Only the part of ``filename`` after the last ``/`` is used in the S3 key.
    Returns the public URL reported by ``_upload_to_s3``.
    """
    base_name = filename.rsplit('/', 1)[-1]
    s3_key = "{}/assets/{}".format(CONFIG['SHORT_PROJECT_NAME'], base_name)
    return _upload_to_s3(filename, s3_key)
def replace_images_with_s3_urls(text):
    """Upload every ``<img>`` source to S3 and point the tag at the new URL.

    Each image's ``src`` value is passed to ``upload_asset_to_s3`` as a file
    name and replaced by the URL that call returns. The re-serialised
    document is returned as a unicode string.

    Reference: http://www.crummy.com/software/BeautifulSoup/bs4/doc/
    """
    document = BeautifulSoup(text)
    for img_tag in document.find_all('img'):
        local_path = img_tag['src']
        img_tag['src'] = upload_asset_to_s3(local_path)
    return unicode(document)
def markdown_to_html(source_text, upload_assets_to_s3=False):
    """Convert from Markdown to HTML; optional: upload images, etc. to S3.

    The text is piped through ``pandoc -f markdown -t html5`` and the result
    is wrapped in a leading and trailing newline.

    Args:
        source_text: Markdown source fed to pandoc on stdin.
        upload_assets_to_s3: when true, images referenced by the generated
            HTML are uploaded via ``replace_images_with_s3_urls`` and their
            ``src`` attributes rewritten.

    Returns:
        The generated HTML.

    Raises:
        subprocess.CalledProcessError: if pandoc exits with a non-zero
            status, so that a failed conversion is not mistaken for an
            empty document.
    """
    args = ['pandoc',
            '-f', 'markdown',
            '-t', 'html5']
    p = subprocess.Popen(args, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
    output = p.communicate(source_text)[0]
    # communicate() waits for pandoc to finish, so returncode is set here.
    if p.returncode != 0:
        raise subprocess.CalledProcessError(p.returncode, args, output)

    # http://wordpress.org/extend/plugins/raw-html/
    # NOTE(review): only newlines are added around the HTML; confirm whether
    # the raw-html plugin referenced above needs explicit marker tags.
    output = '\n' + output + '\n'

    # NOTE: Also assumes that you have added the CSS from
    # `pandoc -S -t html5` to the `style.css` of your active Wordpress theme.
    if upload_assets_to_s3:
        output = replace_images_with_s3_urls(output)
    return output
def _wordpress_get_pages():
    """Return the posts of type ``page`` from the WordPress site.

    Calls ``wp.getPosts`` over XML-RPC using the ``WORDPRESS_*`` environment
    variables for the endpoint and credentials. Up to 100000 entries are
    requested in a single call.
    """
    rpc = ServerProxy(os.environ['WORDPRESS_RPC_URL'])
    print("Fetching list of pages from WordPress")

    blog_id = os.environ['WORDPRESS_BLOG_ID']
    username = os.environ['WORDPRESS_USERNAME']
    password = os.environ['WORDPRESS_PASSWORD']
    query = {
        'post_type': 'page',
        'number': 10 ** 5,
    }
    return rpc.wp.getPosts(blog_id, username, password, query)
def wordpress_new_page(slug, title, content):
    """Create a new, published Wordpress page under the configured parent.

    The page is created with comments and pings closed, and with
    ``WORDPRESS_PARENT_PAGE_ID`` as its parent. Returns whatever the
    ``wp.newPost`` XML-RPC call returns.

    https://codex.wordpress.org/XML-RPC_WordPress_API/Posts#wp.newPost
    https://codex.wordpress.org/Function_Reference/wp_insert_post
    http://docs.python.org/library/xmlrpclib.html
    """
    rpc = ServerProxy(os.environ['WORDPRESS_RPC_URL'])

    blog_id = os.environ['WORDPRESS_BLOG_ID']
    username = os.environ['WORDPRESS_USERNAME']
    password = os.environ['WORDPRESS_PASSWORD']
    page = {
        'post_name': slug,
        'post_content': content,
        'post_title': title,
        'post_parent': os.environ['WORDPRESS_PARENT_PAGE_ID'],
        'post_type': 'page',
        'post_status': 'publish',
        'comment_status': 'closed',
        'ping_status': 'closed',
    }
    return rpc.wp.newPost(blog_id, username, password, page)
def wordpress_edit_page(post_id, title, content):
    """Replace the title and content of an existing Wordpress page.

    Returns whatever the ``wp.editPost`` XML-RPC call returns.

    https://codex.wordpress.org/XML-RPC_WordPress_API/Posts#wp.editPost
    https://codex.wordpress.org/Function_Reference/wp_insert_post
    http://docs.python.org/library/xmlrpclib.html
    """
    rpc = ServerProxy(os.environ['WORDPRESS_RPC_URL'])

    blog_id = os.environ['WORDPRESS_BLOG_ID']
    username = os.environ['WORDPRESS_USERNAME']
    password = os.environ['WORDPRESS_PASSWORD']
    changes = {
        'post_content': content,
        'post_title': title,
    }
    return rpc.wp.editPost(blog_id, username, password, post_id, changes)
def collect_header_anchors(chapter, i, all_headers):
    """Record in ``all_headers`` that chapter ``i`` owns its header ids.

    Every ``h1``..``h6`` element in ``chapter['html']`` that carries an
    ``id`` attribute adds (or overwrites) the entry ``id -> i``.
    """
    heading_tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
    document = BeautifulSoup(chapter['html'])
    all_headers.update(
        (heading['id'], i)
        for heading in document.find_all(heading_tags)
        if 'id' in heading.attrs)
def fix_links_to_other_chapters(chapter, chapters, all_headers):
    """Rewrite in-document ``#anchor`` links to full Wordpress page URLs.

    For every ``<a href="#id">`` in ``chapter['html']``, the chapter that
    owns ``id`` is looked up through ``all_headers`` (anchor id -> index into
    ``chapters``) and the link becomes ``<that chapter's link>#id``. The
    updated HTML is stored back into ``chapter['html']``.

    An assertion fails when a link refers to an anchor id that is not in
    ``all_headers``.
    """
    document = BeautifulSoup(chapter['html'])
    for anchor in document.find_all('a'):
        target = anchor.get('href')
        # Only same-document fragment links need rewriting.
        if target is None or not target.startswith('#'):
            continue

        header_id = target[1:]
        assert header_id in all_headers, \
            "#{} does not exist, referred in {}".format(
                header_id, chapter['file'])

        owner = chapters[all_headers[header_id]]
        anchor['href'] = '{}#{}'.format(owner['link'], header_id)

    chapter['html'] = unicode(document)
def add_previous_next_links(chapter, i, chapters):
    """Append "previous chapter" / "next chapter" links to a chapter's HTML.

    Args:
        chapter: the chapter dict to modify; its ``'html'`` value is extended
            in place.
        i: index of ``chapter`` within ``chapters``.
        chapters: list of all chapter dicts, each carrying a ``'link'`` URL.

    The first chapter gets only a "next" link, the last only a "previous"
    link, and a lone chapter is left untouched. When both links are present
    they are separated by five non-breaking spaces.
    """
    previous_link = chapters[i - 1]['link'] if i > 0 else None
    next_link = chapters[i + 1]['link'] if i < len(chapters) - 1 else None

    # The link text previously had no "{}" placeholder, so the URLs passed to
    # format() were silently dropped. HTML entities are used for the arrows
    # and the separator to keep this source file pure ASCII and to stop the
    # browser from collapsing the spacing.
    navigation = []
    if previous_link is not None:
        navigation.append(
            '<a href="{}">&lArr; Previous chapter</a>'.format(previous_link))
    if next_link is not None:
        navigation.append(
            '<a href="{}">Next chapter &rArr;</a>'.format(next_link))

    if navigation:
        chapter['html'] += "\n" + ('&nbsp;' * 5).join(navigation)
##### Tasks ######################################
@task
def prepare():
    """Stamp today's date into the front page Markdown file.

    The front page is the first entry of ``CONFIG['MARKDOWN_FILES']``. Every
    occurrence of ``$$date$$`` in that file is replaced by the current date
    formatted as ``%d %b %Y`` and the file is rewritten in place.
    """
    frontpage = CONFIG['MARKDOWN_FILES'][0]

    # Read inside a `with` block so the handle is closed before the same
    # path is reopened for writing.
    with open(frontpage['file']) as source:
        content = source.read()

    content = content.replace("$$date$$",
                              datetime.datetime.now().strftime("%d %b %Y"))

    with open(frontpage['file'], 'w') as output:
        output.write(content)
@task
def wp():
    """Publish every chapter as a WordPress page, creating or updating it.

    https://codex.wordpress.org/XML-RPC_WordPress_API/Posts

    Does nothing when ``WORDPRESS_ENABLED`` is false. Otherwise each entry of
    ``CONFIG['MARKDOWN_FILES']`` is rendered to HTML (uploading images to S3
    when ``AWS_ENABLED``), cross-chapter ``#anchor`` links are rewritten,
    previous/next links are appended, and the result is sent to WordPress:
    a chapter whose slug matches an existing page is edited, any other is
    created as a new page.

    Raises:
        RuntimeError: if WordPress returns a falsy result for a page.
    """
    if not WORDPRESS_ENABLED:
        return

    # Work on a copy so the rendered HTML and links never leak into CONFIG.
    chapters = copy.deepcopy(CONFIG['MARKDOWN_FILES'])

    # header anchor id -> index in MARKDOWN_FILES
    all_headers = {}

    # Render html
    print("Rendering html")
    for (i, chapter) in enumerate(chapters):
        with open(chapter['file']) as source:
            markdown = source.read()
        chapter['html'] = markdown_to_html(markdown,
                                           upload_assets_to_s3=AWS_ENABLED)
        collect_header_anchors(chapter, i, all_headers)
        chapter['link'] = "{}/{}/{}".format(
            os.environ['WORDPRESS_BASE_URL'],
            os.environ['WORDPRESS_PARENT_PAGE_SLUG'],
            chapter['slug'])

    # Fix cross-links (needs every chapter's anchors and link, hence a
    # second pass)
    for chapter in chapters:
        fix_links_to_other_chapters(chapter, chapters, all_headers)

    # Add previous and next links at end of html
    for (i, chapter) in enumerate(chapters):
        add_previous_next_links(chapter, i, chapters)

    # Fetch list of pages on the server and determine which already exist.
    # slug -> post id; the first page found for a slug wins.
    # NOTE(review): slugs are matched against every page returned, not only
    # children of WORDPRESS_PARENT_PAGE_ID - confirm that an unrelated page
    # sharing a chapter slug cannot be overwritten.
    existing_page_ids = {}
    for page in _wordpress_get_pages():
        existing_page_ids.setdefault(page.get('post_name'), page['post_id'])

    # Send to WP
    print("Uploading to WordPress")
    for chapter in chapters:
        # The RPC call is made outside of any `assert` so that it still
        # runs when Python is started with -O.
        if chapter['slug'] in existing_page_ids:
            chapter['page_id'] = existing_page_ids[chapter['slug']]
            print("Existing page: {}".format(chapter['link']))
            succeeded = wordpress_edit_page(chapter['page_id'],
                                            chapter['title'],
                                            chapter['html'])
        else:
            print("New page: {}".format(chapter['link']))
            succeeded = wordpress_new_page(chapter['slug'],
                                           chapter['title'],
                                           chapter['html'])
        if not succeeded:
            raise RuntimeError(
                "WordPress upload failed for page: {}".format(
                    chapter['link']))
@task
def html():
    """HTML5 output.

    Runs pandoc over all configured Markdown files to build a standalone
    ``FULL_PROJECT_NAME.html`` with a table of contents, then opens the
    result in a new Firefox tab.
    """
    command = ['pandoc',
               '-f', 'markdown',
               '-t', 'html5',
               '-o', '{}.html'.format(CONFIG['FULL_PROJECT_NAME']),
               '-s',
               '--toc']
    command.extend(entry['file'] for entry in CONFIG['MARKDOWN_FILES'])

    local(' '.join(command))
    local('firefox -new-tab {}.html'.format(CONFIG['FULL_PROJECT_NAME']))
@task
def epub():
    """EPUB output: http://johnmacfarlane.net/pandoc/epub.html

    Runs pandoc over all configured Markdown files to build
    ``FULL_PROJECT_NAME.epub`` and, when ``AWS_ENABLED``, uploads it to S3.
    """
    target = '{}.epub'.format(CONFIG['FULL_PROJECT_NAME'])
    command = ['pandoc',
               '-f', 'markdown',
               '-t', 'epub',
               '-o', target]
    command.extend(entry['file'] for entry in CONFIG['MARKDOWN_FILES'])
    # TODO --epub-cover-image
    # TODO --epub-metadata
    # TODO --epub-stylesheet
    local(' '.join(command))

    if AWS_ENABLED:
        upload_output_to_s3(target)
@task
def pdf():
    """PDF output: http://johnmacfarlane.net/pandoc/README.html#creating-a-pdf

    Runs pandoc over all configured Markdown files to build
    ``FULL_PROJECT_NAME.pdf`` with a table of contents and, when
    ``AWS_ENABLED``, uploads it to S3.
    """
    target = '{}.pdf'.format(CONFIG['FULL_PROJECT_NAME'])
    command = ['pandoc',
               '-f', 'markdown',
               # https://github.com/jgm/pandoc/issues/571
               #'-t', 'pdf',
               '-o', target,
               '--toc']
    command.extend(entry['file'] for entry in CONFIG['MARKDOWN_FILES'])
    local(' '.join(command))

    if AWS_ENABLED:
        upload_output_to_s3(target)
@task
def push():
    """Run the ``wp``, ``epub`` and ``pdf`` tasks, in that order."""
    for publish_step in (wp, epub, pdf):
        publish_step()
@task
def docx():
    """OOXML document format.

    Runs pandoc over all configured Markdown files to build
    ``FULL_PROJECT_NAME.docx`` and, when ``AWS_ENABLED``, uploads it to S3.
    """
    target = '{}.docx'.format(CONFIG['FULL_PROJECT_NAME'])
    command = ['pandoc',
               '-f', 'markdown',
               '-t', 'docx',
               '-o', target]
    command.extend(entry['file'] for entry in CONFIG['MARKDOWN_FILES'])
    local(' '.join(command))

    if AWS_ENABLED:
        upload_output_to_s3(target)
@task
def odt():
    """OpenDocument document format.

    Runs pandoc over all configured Markdown files to build
    ``FULL_PROJECT_NAME.odt`` and, when ``AWS_ENABLED``, uploads it to S3.
    """
    target = '{}.odt'.format(CONFIG['FULL_PROJECT_NAME'])
    command = ['pandoc',
               '-f', 'markdown',
               '-t', 'odt',
               '-o', target]
    command.extend(entry['file'] for entry in CONFIG['MARKDOWN_FILES'])
    local(' '.join(command))

    if AWS_ENABLED:
        upload_output_to_s3(target)
@task
def clean():
    """Remove generated output files

    Deletes the html, epub, pdf, docx and odt outputs named after
    ``FULL_PROJECT_NAME`` from the current directory, skipping any that do
    not exist, and prints the name of each file removed.
    """
    output_templates = ('{}.html', '{}.epub', '{}.pdf', '{}.docx', '{}.odt')
    candidates = tuple(template.format(CONFIG['FULL_PROJECT_NAME'])
                       for template in output_templates)

    for candidate in candidates:
        if not os.path.exists(candidate):
            continue
        os.remove(candidate)
        print("Removed {}".format(candidate))