#!/usr/bin/env python
#
# tagclouds.py - v0.5
#
# Copyright (c) 2006 - Rich Burridge - Sun Microsystems Inc.
# All Rights Reserved.
#
# Script to extract the Technorati tags out of my Sun blog postings.
# These are then used to generate a simple tag cloud in HTML format
# that is written to standard output.
#
# Each of these tag words has a dynamic layer associated with it
# which contains a list of hypertext links for all of the blog posts
# that had that tag.
#
# The Blog postings have been saved into a directory on my local disk
# using the Grabber application in the BlogClientUI jar.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published
# by the Free Software Foundation; either version 2, or (at your option)
# any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# TODO:
# > Improve the CSS to make the tag cloud appearance (and the popup link
# layer) look better.
# > Improve the JavaScript so that the layer appears on the screen without
# having to scroll using the keyboard arrow keys.
# > Improve the JavaScript so that the layer doesn't go away at inappropriate
# times (such as when you move the mouse out of it).
import os
import sys
import string
# --- Configuration ---------------------------------------------------------

# Directory that is scanned for the saved blog postings (*.html files).
blogPostsDir = "/export/home/richb/CDROM/Blog/Backup/saved"

# When True, progress messages are written to standard error.
debug = False

# Ten sizes, one per tag cloud level.
# NOTE(review): not referenced by any code visible in this file; presumably
# it was meant to feed the font sizes in the generated CSS - confirm.
fontSize = [12, 12, 16, 16, 20, 20, 24, 24, 30, 30]

# --- Results shared between the functions below -----------------------------

# One [tag name, post title, entry name] record per tag occurrence.
tags = []

# Tag name -> number of postings carrying that tag.
counts = {}

# Tag name -> list of link strings for the postings carrying that tag.
tagLinks = {}
def adjustTitle(title):
    """Adjust the blog title string to escape any quote characters.

    Any double quotes found in the title string are turned into single
    quotes, and every single quote is then preceded by a backslash. All
    other characters are left unchanged.

    Arguments:
    - title: the title string to be adjusted.

    Returns the adjusted title string.
    """

    # Two whole-string passes instead of building the result one character
    # at a time: first unify the quote style, then escape it.
    return title.replace("\"", "'").replace("'", "\\'")
def createTotals(tags):
    """Process the list of tags that were extracted from all the blog
    postings and generate a count for each one, plus a list of links
    associated with each tag.

    The results are stored in the module-level "counts" dictionary (tag
    name -> number of occurrences) and "tagLinks" dictionary (tag name ->
    list of link strings, in the order the entries appear in "tags").
    Every tag is recorded, including the one that comes last in the list.

    Arguments:
    - tags: the sorted list of Technorati tags. Each entry is indexed as
      [tag name, post title, entry name].
    """

    totals = {}
    linksByTag = {}
    for entry in tags:
        tag = entry[0]
        title = adjustTitle(entry[1])
        pathname = entry[2]

        # Keep what precedes the last "<" of the entry name field. When
        # there is no "<", rfind() returns -1 and the last character is
        # dropped instead.
        filename = pathname[0:pathname.rfind("<")]

        # NOTE(review): the two literals around the title are empty in this
        # copy of the file, so the "link" is just the escaped title and
        # "filename" is never used. Presumably the anchor markup that
        # pointed at the entry was lost - restore it from the original.
        link = "" + title + ""

        # Grouping by dictionary key means no group can be left unrecorded
        # at the end of the list.
        totals[tag] = totals.get(tag, 0) + 1
        linksByTag.setdefault(tag, []).append(link)

    counts.update(totals)
    tagLinks.update(linksByTag)
def getTitle(line):
    """Extract the title of the blog post from this line. This will be
    the text between the opening and closing title markers.

    NOTE(review): both marker literals are empty strings in this copy of
    the file; presumably the tag text was lost. find("") returns 0, so as
    written the slice is line[7:0] and the result is always the empty
    string. The offset 7 is presumably the length of the opening marker -
    confirm when the markers are restored.

    Arguments:
    - line: the line from the blog post containing the title

    Returns the extracted title string.
    """

    begin = line.find("")
    finish = line.find("")
    return line[begin + 7:finish]
def _readTagName(tokens, start):
    """Collect the tag name that begins at tokens[start].

    Tokens are appended, each followed by a single space, until a token
    containing "<" is reached; the text before that "<" completes the name.

    Arguments:
    - tokens: the whitespace separated tokens of one line.
    - start: index of the first token of the tag name.

    Returns the tag name, or None if the tokens run out before a "<" is
    found.
    """

    tagName = ""
    for token in tokens[start:]:
        cut = token.find("<")
        if cut != -1:
            return tagName + token[0:cut]
        tagName += token + " "
    return None


def getTags(pathname):
    """Get all the Technorati tags for the blog posting in the given filename.

    All the lines of that blog posting are read in, then split into tokens.
    The title is taken from the first line. A token starting with
    id="permalink supplies the entry name. A search is made for the string
    "Tag:". If found, then the Technorati tag is extracted. This will be in
    the subsequent token(s) on the same line, up until a "<" character is
    found. After extraction, a [tag name, title, entry name] record is
    added to the "tags" list.

    If a tag name is not terminated by "<" on its line, or no permalink
    token has been seen before the tag, a "Problems reading" message is
    written to standard error and that tag is skipped.

    Arguments:
    - pathname: path name of the text file containing a blog posting
    """

    if debug:
        sys.stderr.write("Reading %s\n" % pathname)

    # The with-block closes the file as soon as the lines have been read,
    # even if reading fails.
    with open(pathname, 'r') as fin:
        lines = fin.readlines()

    if debug:
        sys.stderr.write("Read %d lines.\n" % len(lines))

    title = None
    entryName = None
    for lineNumber, line in enumerate(lines):
        # Get the title of the entry (apparently is always on the first line).
        if lineNumber == 0:
            title = getTitle(line)

        tokens = line.split()
        for j, token in enumerate(tokens):
            if token.startswith("id=\"permalink"):
                # The slice deliberately keeps the "<" that follows the
                # name; createTotals() cuts the name at its last "<".
                start = token.find(">") + 1
                end = token.find("<") + 1
                entryName = token[start:end]

            if token != "Tag:":
                continue

            tagName = _readTagName(tokens, j + 1)
            if tagName is None or entryName is None:
                sys.stderr.write("Problems reading %s\n" % pathname)
                continue

            if debug:
                sys.stderr.write("tagName: %s title: %s entry name: %s\n" \
                                 % (tagName, title, entryName))
            tags.append([tagName, title, entryName])
def getAllTags(dir):
    """Extract the Technorati tags from each of the files in the blog
    posting directory. After all the files have been processed, sort
    the tags and create totals for each tag name.

    Only regular files whose names end in ".html" are read, in sorted
    file name order. The module-level "tags" list is sorted in place
    before it is handed to createTotals().

    Arguments:
    - dir: the blog posting directory.
    """

    for filename in sorted(os.listdir(dir)):
        # Check the extension first so that other directory entries are
        # skipped without building a path for them.
        if not filename.endswith(".html"):
            continue

        pathname = os.path.join(dir, filename)
        if os.path.isfile(pathname):
            getTags(pathname)

    tags.sort()
    createTotals(tags)
def writeTag(tag, count, links):
    """Write the HTML for this tag to standard output. The tag will have
    a dynamic layer associated with it which will contain a list of the
    blog links that contain this tag.

    The tag is given one of ten style levels, "level0" to "level9",
    according to its count: every band of five occurrences moves up one
    level (fewer than 5 is "level0", fewer than 10 is "level1", ...) and
    counts of 45 or more all use "level9".

    Arguments:
    - tag: the name of the tag.
    - count: the number of occurrences of this tag.
    - links: a list of blog links that have this tag.
    """

    # NOTE(review): the string literals in the print statements of this
    # function look damaged in this copy of the file; presumably the HTML
    # markup inside them was lost. As written, the first literal has no
    # conversion specifier for "tag", the third literal is split across two
    # lines, and the last literal has one "%s" for two values. Restore the
    # literals from the original before running this.

    # The trailing commas keep the following output on the same line.
    print " Links:" % tag,
    for i in range(0, len(links)):
        print "- %s" % links[i],
    print "
');\" onMouseOut=\"OutLayer();\">",

    # Map the occurrence count onto one of the ten level names.
    if count < 5:
        level = "level0"
    elif count < 10:
        level = "level1"
    elif count < 15:
        level = "level2"
    elif count < 20:
        level = "level3"
    elif count < 25:
        level = "level4"
    elif count < 30:
        level = "level5"
    elif count < 35:
        level = "level6"
    elif count < 40:
        level = "level7"
    elif count < 45:
        level = "level8"
    else:
        level = "level9"

    print " %s \n" % (level, tag)
def makeTagCloud(counts):
    """Using the counts for each of the tags, write a simple HTML page to
    standard output containing a tag cloud representation. The CSS
    describes ten levels, each of which has differing font-size's,
    line-height's and font-weight's.

    Note that the dictionary keys are sorted before the tag cloud is
    generated. The links for each tag are taken from the module-level
    "tagLinks" dictionary, which must have an entry for every key in
    "counts".

    Arguments:
    - counts: dictionary of tags and their counts.
    """

    # NOTE(review): the page header and footer literals below are empty or
    # split across two lines in this copy of the file; presumably the HTML,
    # CSS and JavaScript they contained was lost. Restore them from the
    # original before running this.
    print ""
    print ""
    print "Rich Burridge's Blog's Tag Cloud"
    print ""
    print ""
    print ""
    print ""
    print ""
    print "
"
    print ""

    # keys() returns a list here (Python 2), which is sorted in place so
    # that the tags are written in sorted order.
    keys = counts.keys()
    keys.sort()
    for i in range(0, len(keys)):
        tag = keys[i]
        writeTag(tag, counts[tag], tagLinks[tag])

    print "
"
    print ""
    print ""
def main():
    """Extract the tags from the blog postings in blogPostsDir, then write
    the tag cloud page for the resulting counts to standard output.
    """

    getAllTags(blogPostsDir)
    makeTagCloud(counts)
# Generate the tag cloud only when this file is run as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()