#!/usr/bin/env python
#
# tagclouds.py - v0.5
#
# Copyright (c) 2006 - Rich Burridge - Sun Microsystems Inc.
# All Rights Reserved.
#
# Script to extract the Technorati tags out of my Sun blog postings.
# These are then used to generate a simple tag cloud in HTML format
# that is written to standard output.
#
# Each of these tag words has a dynamic layer associated with it
# which contains a list of hypertext links for all of the blog posts
# that had that tag.
#
# The Blog postings have been saved into a directory on my local disk
# using the Grabber application in the BlogClientUI jar.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published
# by the Free Software Foundation; either version 2, or (at your option)
# any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# TODO:
#  > Improve the CSS to make the tag cloud appearance (and the popup link
#    layer) look better.
#  > Improve the JavaScript so that the layer appears on the screen without
#    having to scroll using the keyboard arrow keys.
#  > Improve the JavaScript so that the layer doesn't go away at inappropriate
#    times (such as when you move the mouse out of it).
#
# NOTE(review): this copy of the file looks like it went through an HTML
# stripper: string literals that should contain markup hold only the text
# that was between the tags.  The parsing code below has been repaired, but
# the output literals in createTotals(), writeTag() and makeTagCloud() still
# need to be restored from a pristine copy before the generated page can work.

import os
import sys
import string

blogPostsDir = "/export/home/richb/CDROM/Blog/Backup/saved"
counts = {}        # tag name -> number of postings carrying that tag
debug = False      # when True, progress messages are written to stderr
fontSize = [12, 12, 16, 16, 20, 20, 24, 24, 30, 30]
tags = []          # [tagName, title, entryName] triples, one per tag found
tagLinks = {}      # tag name -> list of link strings for that tag


def adjustTitle(title):
    """Adjust the blog title string to escape any quote characters.

    Any single quotes found in the title string are escaped. Any double
    quotes found in the title string are turned into single quotes and
    escaped.

    Arguments:
    - title: the title string to be adjusted.

    Returns the adjusted title string.
    """

    # Fold double quotes into single quotes first, then escape every single
    # quote in one pass; the result is identical to escaping both kinds.
    return title.replace("\"", "'").replace("'", "\\'")


def createTotals(tags):
    """Process the list of tags that were extracted from all the blog
    postings and generate a count for each one, plus a list of links
    associated with each tag.

    The results are stored in the module-level "counts" and "tagLinks"
    dictionaries; an entry for a tag replaces any earlier entry for it.
    Entries with an empty tag name are ignored.

    Arguments:
    - tags: the list of [tagName, title, entryName] entries. Entries for
      the same tag are grouped whether or not the list is sorted.
    """

    totals = {}
    links = {}
    for entry in tags:
        tag = entry[0]
        if not tag:
            # An empty tag name was never recorded by the previous
            # run-length loop either (it crashed on it when it came first).
            continue

        # NOTE(review): only the escaped title survives as the link text in
        # this copy.  The entry name (entry[2]) was trimmed at its last "<"
        # but never used, so presumably the anchor markup that referenced it
        # has been lost - restore it from the original.
        link = adjustTitle(entry[1])

        totals[tag] = totals.get(tag, 0) + 1
        links.setdefault(tag, []).append(link)

    # Publish everything at the end so that the last tag is recorded too;
    # previously the final group was dropped because it was only stored when
    # a *following* tag was seen.
    counts.update(totals)
    tagLinks.update(links)


def getTitle(line):
    """Extract the title of the blog post from this line. This will be
    between the "<title>" and "</title>" tags.

    Arguments:
    - line: the line from the blog post containing the title

    Returns the title text, or an empty string if the line does not
    contain both tags in order.
    """

    # The markers were empty strings in this copy, which made the function
    # always return "".  The original "+7" offset is len("<title>").
    openTag = "<title>"
    closeTag = "</title>"

    start = line.find(openTag)
    end = line.find(closeTag)
    if start == -1 or end < start:
        return ""

    return line[start + len(openTag):end]


def _readTagName(tokens, first):
    """Join the tokens from index "first" up to the first "<" character.

    Returns the tag name, or None if no token from "first" onwards
    contains a "<".
    """

    words = []
    for token in tokens[first:]:
        cut = token.find("<")
        if cut != -1:
            words.append(token[:cut])
            return " ".join(words)
        words.append(token)

    return None


def getTags(pathname):
    """Get all the Technorati tags for the blog posting in the given
    filename. All the lines of that blog posting are read in, then
    split into tokens. A search is made for the string "Tag:". If found,
    then the Technorati tag is extracted. This will be in the subsequent
    token(s) on the same line, up until a "<" character is found. After
    extraction, this tag is added to the "tags" list.

    A tag that cannot be extracted (no "<" follows it on the line, or no
    permalink token has been seen yet) is reported on standard error and
    skipped.

    Arguments:
    - pathname: path name of the text file containing a blog posting
    """

    if debug:
        sys.stderr.write("Reading %s\n" % pathname)

    # try/finally rather than "with" so the script still runs on the old
    # interpreters it was written for, while no longer leaking the handle.
    fin = open(pathname, 'r')
    try:
        lines = fin.readlines()
    finally:
        fin.close()

    if debug:
        sys.stderr.write("Read %d lines.\n" % len(lines))

    title = ""
    entryName = None     # set once the permalink token has been seen

    for i, line in enumerate(lines):
        # Get the title of the entry (apparently is always on the first line).
        if i == 0:
            title = getTitle(line)

        tokens = line.split()
        for j, token in enumerate(tokens):
            if token.startswith("id=\"permalink"):
                # The slice deliberately keeps the "<" that ends the entry
                # name, exactly as before.
                start = token.find(">") + 1
                end = token.find("<") + 1
                entryName = token[start:end]

            if token != "Tag:":
                continue

            tagName = _readTagName(tokens, j + 1)
            if tagName is None or entryName is None:
                # These are the two failures the old bare "except:" covered.
                sys.stderr.write("Problems reading %s\n" % pathname)
                continue

            if debug:
                sys.stderr.write("tagName: %s title: %s entry name: %s\n" \
                                 % (tagName, title, entryName))
            tags.append([tagName, title, entryName])


def getAllTags(dir):
    """Extract the Technorati tags from each of the files in the blog
    posting directory. After all the files have been processed, sort
    the tags and create totals for each tag name.

    Only regular files whose names end in ".html" are read.

    Arguments:
    - dir: the blog posting directory,
    """

    for filename in sorted(os.listdir(dir)):
        if not filename.endswith(".html"):
            continue

        pathname = os.path.join(dir, filename)
        if os.path.isfile(pathname):
            getTags(pathname)

    tags.sort()
    createTotals(tags)


def writeTag(tag, count, links):
    """Write the output for this tag to standard output. The tag is meant
    to have a dynamic layer associated with it which contains a list of
    the blog links that contain this tag.

    Arguments:
    - tag: the name of the tag.
    - count: the number of occurrences of this tag.
    - links: a list of blog links that have this tag.
    """

    # NOTE(review): this literal is what survives of the opening markup in
    # this copy (the text was split across physical lines, which is a syntax
    # error, so the line breaks are written as "\n").  The start of the
    # element and of the mouse-over handler are missing, and "links" is not
    # referenced anywhere - restore from the original.
    opening = "  Links:\n\n');\" onMouseOut=\"OutLayer();\">"

    # The old "print ...," left the cursor on the same line and put a single
    # space before the next item; the explicit " " reproduces that.
    sys.stdout.write(opening + " ")

    # One level per five occurrences: 0-4 -> level0 ... 45 and up -> level9.
    level = "level%d" % min(max(count // 5, 0), 9)

    # NOTE(review): the surviving format string had a single %s for the two
    # arguments (level, tag), which raises TypeError.  The span/class markup
    # carrying "level" is a reconstruction - presumably the level names are
    # CSS classes; confirm against the original.
    sys.stdout.write("<span class=\"%s\"> %s</span>\n\n\n" % (level, tag))


def makeTagCloud(counts):
    """Using the counts for each of the tags, write a simple HTML page
    to standard output containing a tag cloud representation. The
    CSS describes ten levels, each of which has differing font-size's,
    line-height's and font-weight's. Note that the dictionary keys are
    sorted before the tag cloud is generated.

    Arguments:
    - counts: dictionary of tags and their counts.
    """

    # NOTE(review): apart from the page title, the text of every line below
    # is empty or a bare line break in this copy - presumably the page
    # skeleton, style sheet and script were stripped.  The surviving text is
    # emitted unchanged, one item per output line; restore the markup from
    # the original.
    header = ["", "", "Rich Burridge's Blog's Tag Cloud",
              "", "", "", "", "\n", "\n", "\n"]
    footer = ["\n", "", ""]

    for text in header:
        sys.stdout.write(text + "\n")

    for tag in sorted(counts):
        writeTag(tag, counts[tag], tagLinks.get(tag, []))

    for text in footer:
        sys.stdout.write(text + "\n")


def main():
    """Read every saved posting and write the tag cloud to standard output."""

    getAllTags(blogPostsDir)
    makeTagCloud(counts)


if __name__ == "__main__":
    main()