#!/usr/bin/env python
#
# blog2pdf.py - v0.1
#
# Copyright (c) 2006 - Rich Burridge - Sun Microsystems Inc.
# All Rights Reserved.
#
# Script to convert my Sun blog postings into a single huge HTML file,
# which is written to standard output. This could then be converted into
# a PDF file using OpenOffice.
#
# The Blog postings have been saved into a directory on my local disk
# using the Grabber application in the BlogClientUI jar.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published
# by the Free Software Foundation; either version 2, or (at your option)
# any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.

import os
import re
import sys
import string

blogPostsDir = "/home/richb/CDROM/Blog/Backup/posts"

debug = False

months = [
    "January", "February", "March", "April", "May", "June",
    "July", "August", "September", "October", "November", "December",
]

title = "Rich Burridge's Blog"

# NOTE(review): the copy of this file that was reviewed had every HTML tag
# stripped out of its string literals, so the markup below is reconstructed
# and must be confirmed against a real Grabber post file:
#   - the title offset used to be "+4", i.e. a four character opening tag;
#     any plain heading tag (<h1> .. <h6>) is accepted here.
#   - the tag that closes the permalink text is unknown, so getUrl() simply
#     stops at the next "<".
#   - the emitted heading (<h3>), separator (<hr>) and the <html>/<body>
#     wrapper are guesses that match the argument counts and docstrings.
_TITLE_RE = re.compile(r"<(h[1-6])>(.*?)</\1>", re.IGNORECASE)
_PERMALINK_MARKER = 'permalink">'


def getTitle(line):
    """Extract the title of the blog post from this line. This is the
    text between an opening heading tag and its matching closing tag
    (presumably; see the NOTE(review) at the top of the file).

    Arguments:
    - line: the line from the blog post containing the title

    Returns the title text, or an empty string if the line contains no
    heading element (the old code returned an arbitrary slice of the line
    in that case).
    """

    match = _TITLE_RE.search(line)
    if match is None:
        return ""

    return match.group(2)


def getUrl(line):
    """Extract the URL of the blog post from this line. This is the text
    that follows 'permalink">' up to the start of the next tag.

    Arguments:
    - line: the line from the blog post containing the URL

    Returns the URL text, or an empty string if the permalink marker is
    not present on the line.
    """

    markerAt = line.find(_PERMALINK_MARKER)
    if markerAt == -1:
        return ""

    urlStart = markerAt + len(_PERMALINK_MARKER)
    urlEnd = line.find("<", urlStart)
    if urlEnd == -1:
        # No closing tag on this line: take the rest, minus the newline.
        return line[urlStart:].strip()

    return line[urlStart:urlEnd]


def writeDateAndTime(filename):
    """Take a filename of the form "YYYYMMDD-HHMM.html" and write out a
    date and time entry to standard output.

    Arguments:
    - filename: file name of the text file containing a blog posting
    """

    year = filename[0:4]
    month = filename[4:6]
    day = filename[6:8]
    hour = filename[9:11]
    minute = filename[11:13]

    sys.stdout.write(" [%s %s, %s %s:%s]\n"
                     % (months[int(month) - 1], day, year, hour, minute))


def getContent(pathname, filename):
    """Get the HTML content for the blog posting in the given filename.
    All the lines of that blog posting are read in. The first and last
    ones are ignored. The blog posting title is read from the second line
    and the URL of the blog posting is read from the third one. These are
    used to create a hyperlink title for this post. The rest of the lines
    are just written straight out to standard out.

    Files with fewer than three lines cannot hold a title and a URL; they
    are reported on standard error and skipped.

    Arguments:
    - pathname: full path name of the text file containing a blog posting
    - filename: file name of the text file containing a blog posting
    """

    if debug:
        sys.stderr.write("Reading %s\n" % pathname)

    # The "with" block guarantees the file is closed, even on error.
    with open(pathname, 'r') as fin:
        lines = fin.readlines()

    if debug:
        sys.stderr.write("Read %d lines.\n" % len(lines))

    if len(lines) < 3:
        sys.stderr.write("Skipping %s: too short to be a blog posting.\n"
                         % pathname)
        return

    # Named postTitle so that it does not hide the module level "title".
    postTitle = getTitle(lines[1])
    url = getUrl(lines[2])

    sys.stdout.write('<h3><a href="%s">%s</a></h3>\n' % (url, postTitle))
    writeDateAndTime(filename)

    # Everything after the URL line, except the very last line.
    sys.stdout.writelines(lines[3:-1])

    sys.stdout.write("<hr>\n")


def getAllContent(dir):
    """Extract the HTML content from each of the files in the blog
    posting directory, in sorted file name order. Only regular files
    whose names start with "2" and end with ".html" are used, which
    leaves out the "grabber-index.html" file.

    Arguments:
    - dir: the blog posting directory (the name hides the builtin, but
      is kept so that existing keyword callers continue to work).
    """

    for filename in sorted(os.listdir(dir)):
        if not (filename.startswith("2") and filename.endswith(".html")):
            continue

        pathname = os.path.join(dir, filename)
        if os.path.isfile(pathname):
            getContent(pathname, filename)


def writeIntro():
    """Write the initial HTML content to standard output."""

    sys.stdout.write("<html><head><title>%s</title></head>\n" % title)
    sys.stdout.write("<body>\n")


def writeOutro():
    """Write the final HTML content to standard output."""

    sys.stdout.write("</body></html>\n")


def main():
    """Write the intro, every blog posting and then the outro."""

    writeIntro()
    getAllContent(blogPostsDir)
    writeOutro()


if __name__ == "__main__":
    main()