#!/usr/bin/env python
#
# blog2pdf.py - v0.1
#
# Copyright (c) 2006 - Rich Burridge - Sun Microsystems Inc.
# All Rights Reserved.
#
# Script to convert my Sun blog postings into a single huge HTML file,
# which is written to standard output. This could then be converted into
# a PDF file using OpenOffice.
#
# The Blog postings have been saved into a directory on my local disk
# using the Grabber application in the BlogClientUI jar.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published
# by the Free Software Foundation; either version 2, or (at your option)
# any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
import os
import sys
import string
# Directory holding the blog postings that were saved to local disk.
blogPostsDir = "/home/richb/CDROM/Blog/Backup/posts"

# When True, progress messages are written to standard error.
debug = False

# Month names, indexed by (month number - 1).
months = [
    "January", "February", "March", "April", "May", "June",
    "July", "August", "September", "October", "November", "December",
]

# Title printed at the start of the generated output.
title = "Rich Burridge's Blog"
def getTitle(line, startTag="<h3>", endTag="</h3>"):
    """Extract the title of the blog post from this line. This will be
    the text between the start tag and the end tag.

    NOTE(review): the tag literals were missing from this copy of the
    script (the searches were for an empty string and for a line break).
    "<h3>" is inferred from the original 4-character skip past the start
    tag -- confirm against the files written by Grabber.

    Arguments:
    - line: the line from the blog post containing the title
    - startTag: the markup that immediately precedes the title
    - endTag: the markup that immediately follows the title

    Returns the text between the two tags, or an empty string if either
    tag is not present in the line.
    """

    start = line.find(startTag)
    if start < 0:
        return ""
    start += len(startTag)

    # Only look for the end tag after the start tag, so that an earlier
    # occurrence of it cannot produce an empty or reversed slice.
    end = line.find(endTag, start)
    if end < 0:
        return ""

    return line[start:end]
def getUrl(line, startTag='permalink">', endTag="</a>"):
    """Extract the URL of the blog post from this line. This will be
    the text between the 'permalink">' marker and the end tag.

    NOTE(review): the end tag literal was missing from this copy of the
    script (the search was for an empty string, which always produced an
    empty result). "</a>" is an assumption -- confirm against the files
    written by Grabber.

    Arguments:
    - line: the line from the blog post containing the URL
    - startTag: the markup that immediately precedes the URL
    - endTag: the markup that immediately follows the URL

    Returns the text between the two markers, or an empty string if
    either marker is not present in the line.
    """

    start = line.find(startTag)
    if start < 0:
        return ""
    start += len(startTag)

    # Search from the end of the start marker: the line may contain other
    # closing tags before the permalink.
    end = line.find(endTag, start)
    if end < 0:
        return ""

    return line[start:end]
def writeDateAndTime(filename):
    """Take a filename of the form "YYYYMMDD-HHMM.html" and write out
    a date and time entry to standard output.

    Arguments:
    - filename: file name of the text file containing a blog posting

    Raises ValueError if the month field is not a number in the
    range 1 to 12.
    """

    year = filename[0:4]
    month = filename[4:6]
    day = filename[6:8]
    hour = filename[9:11]
    minute = filename[11:13]

    # Reject out-of-range months explicitly: a month of "00" would
    # otherwise index the list from the end and print "December".
    monthNumber = int(month)
    if monthNumber < 1 or monthNumber > len(months):
        raise ValueError("Bad month %r in file name %r" % (month, filename))

    # A single parenthesized argument prints the same text whether print
    # is the Python 2 statement or the Python 3 function.
    print(" [%s %s, %s %s:%s]"
          % (months[monthNumber - 1], day, year, hour, minute))
def getContent(pathname, filename):
    """Get the HTML content for the blog posting in the given filename.
    All the lines of that blog posting are read in. The first and last ones
    are ignored. The blog posting title is read from the second line and
    the URL of the blog posting is read from the third one. These are used
    to create a hyperlink title for this post, which is followed by the
    date and time taken from the file name. The rest of the lines are
    just written straight out to standard out.

    A file with fewer than three lines has no title or URL line; it is
    reported on standard error and skipped.

    NOTE(review): the HTML markup in the two literal strings below was
    missing from this copy of the script (the heading format string was
    empty and the final string was a bare line break). The heading link
    and the "<hr>" separator are reconstructions -- confirm the intended
    markup.

    Arguments:
    - pathname: full path name of the text file containing a blog posting
    - filename: file name of the text file containing a blog posting
    """

    if debug:
        sys.stderr.write("Reading %s\n" % pathname)

    # The with block closes the file as soon as it has been read.
    with open(pathname, 'r') as fin:
        lines = fin.readlines()

    if debug:
        sys.stderr.write("Read %d lines.\n" % len(lines))

    if len(lines) < 3:
        sys.stderr.write("Skipping %s: too few lines.\n" % pathname)
        return

    # Named postTitle so that it does not hide the module-level title.
    postTitle = getTitle(lines[1])
    url = getUrl(lines[2])
    print('<h3><a href="%s">%s</a></h3>' % (url, postTitle))

    writeDateAndTime(filename)

    # Everything after the URL line, except the last line, is copied
    # through unchanged; each line still carries its own line ending.
    sys.stdout.writelines(lines[3:-1])

    print("<hr>")
def getAllContent(dir):
    """Write out the content of every blog posting found in a directory.

    The directory entries are visited in sorted order. Only regular files
    whose names start with "2" and end with ".html" are processed, so
    other entries (for example "grabber-index.html") are ignored.

    Arguments:
    - dir: the blog posting directory
    """

    for entry in sorted(os.listdir(dir)):
        fullPath = dir + "/" + entry

        # Check the name first; the file system is only consulted for
        # entries whose names look like blog postings.
        if not entry.startswith("2"):
            continue
        if not entry.endswith(".html"):
            continue
        if not os.path.isfile(fullPath):
            continue

        getContent(fullPath, entry)
def writeIntro():
    """Write the initial HTML content to standard output.

    This opens the document, gives it the module-level title and opens
    the body. writeOutro() writes the matching closing tags.

    NOTE(review): the HTML markup was missing from this copy of the
    script (only the bare title and an empty string were printed). The
    tags below are a minimal reconstruction -- confirm the intended
    markup.
    """

    # Single-argument print(...) gives the same output under Python 2
    # and Python 3.
    print("<html><head><title>%s</title></head>" % title)
    print("<body>")


def writeOutro():
    """Write the final HTML content to standard output.

    This closes the body and the document opened by writeIntro().

    NOTE(review): the HTML markup was missing from this copy of the
    script (an empty string was printed). The closing tags below are a
    reconstruction -- confirm the intended markup.
    """

    print("</body></html>")
def main():
    """Write the complete document to standard output.

    The introduction is written first, then the content of every blog
    posting found in blogPostsDir, then the closing content.
    """

    writeIntro()
    getAllContent(blogPostsDir)
    writeOutro()


# Only generate output when run as a script, not when imported.
if __name__ == "__main__":
    main()