#!/usr/bin/python
# -*- encoding: UTF-8 -*-
"""Convert a whitespace-separated UTF-8 corpus into per-character tag lines.

Every character of every word is written to stdout as ``<char> <tag>``,
one pair per line.  The tags are:

* ``S``  - the only character of a one-character word
* ``B``  - first character of a longer word
* ``B2`` - second character (when it is not the last one)
* ``B3`` - third character (when it is not the last one)
* ``M``  - any further character that is not the last one
* ``E``  - last character of a word with two or more characters

An empty line is written after each input line.

Usage: pass the corpus file path as the first command-line argument.
"""

import sys
import codecs

# Tags for the first three non-final positions of a multi-character word;
# non-final positions beyond these are tagged 'M'.
_LEADING_TAGS = ('B', 'B2', 'B3')


def _write_line(text):
    """Write *text* plus a newline to stdout.

    On Python 2 the corpus text is a ``unicode`` object, which is not an
    instance of ``str``; it is encoded to UTF-8 before writing, matching the
    bytes the original ``print x.encode('UTF-8')`` produced.  On Python 3
    the text is already ``str`` and is handed to stdout as-is, so stdout's
    own encoding applies.
    """
    if not isinstance(text, str):
        text = text.encode('UTF-8')
    sys.stdout.write(text + '\n')


def process_word(word):
    """Write one ``<char> <tag>`` line to stdout per character of *word*.

    *word* is a text string.  An empty word produces no output (previously
    it raised ``IndexError``).
    """
    length = len(word)
    if length == 0:
        return
    if length == 1:
        _write_line(word + ' S')
        return
    # Every character except the last gets a positional tag; the last is 'E'.
    for index, char in enumerate(word[:-1]):
        tag = _LEADING_TAGS[index] if index < len(_LEADING_TAGS) else 'M'
        _write_line(char + ' ' + tag)
    _write_line(word[-1] + ' E')


def process_file(file):
    """Tag every whitespace-separated word of every line yielded by *file*.

    *file* is any iterable of text lines.  A blank line is written after
    each input line, including input lines that contain no words.
    """
    for line in file:
        for word in line.strip().split():
            process_word(word)
        _write_line('')


def main(argv=None):
    """Open the corpus named by ``argv[1]`` and tag it to stdout.

    *argv* defaults to ``sys.argv``.  If the path is missing or the file
    cannot be opened, the error message goes to stderr and the process
    exits with a non-zero status, so the message never mixes with the
    tagged output and callers can detect the failure.
    """
    args = sys.argv if argv is None else argv
    try:
        corpus = codecs.open(args[1], "r", "utf-8")
    except (IndexError, IOError):
        # sys.exit with a string prints it to stderr and exits with status 1.
        sys.exit("corpus file is not specified, or open failed!")
    try:
        process_file(corpus)
    finally:
        # Close the corpus even if tagging fails part-way through.
        corpus.close()


if __name__ == "__main__":
    main()