#!/usr/bin/python
# -*- encoding: UTF-8 -*-
"""Turn a tagged corpus into one context-feature event per token.

Input: a UTF-8 text file with one ``word tag`` pair per line; an empty
(or whitespace-only) line marks a boundary between sequences.

Output (stdout, UTF-8): for every token, one space-separated line

    tag U00-prev U01-cur U02-next U03-prev/cur U04-cur/next U05-prev/next prev_tag

where ``prev``/``cur``/``next`` are the neighbouring words, ``_B`` stands
for "before the first line" and ``E_`` for a boundary.

The script runs unchanged on Python 2 and Python 3.
"""

import sys
import codecs

# Sliding window of (word, tag) pairs: previous, current and next entry.
# It starts out holding only the begin-of-corpus sentinel.  The list is
# always mutated in place (never rebound) so outside references stay valid.
window = [('_B', 'E')]


def _emit(line):
    """Write one event line to stdout, UTF-8 encoded where that is needed."""
    if not isinstance(line, str):
        # Python 2 ``unicode``: stdout is a byte stream there, so encode
        # explicitly instead of relying on the implicit ASCII codec.
        line = line.encode('UTF-8')
    sys.stdout.write(line + '\n')


def process_window():
    """Emit the event for the middle entry of ``window`` and slide it by one.

    ``window`` must hold exactly three ``(word, tag)`` pairs (previous,
    current, next); with fewer entries an ``IndexError`` is raised.  Nothing
    is printed when the current entry is the ``E_`` boundary marker, but the
    oldest entry is dropped in either case.
    """
    (prev_word, prev_tag), (word, tag), (next_word, _) = window[:3]
    if word != 'E_':
        event = (
            tag,
            'U00-%s' % prev_word,
            'U01-%s' % word,
            'U02-%s' % next_word,
            'U03-%s/%s' % (prev_word, word),
            'U04-%s/%s' % (word, next_word),
            'U05-%s/%s' % (prev_word, next_word),
            prev_tag,
        )
        _emit(' '.join(event))
    del window[0]


def process_file(file):
    """Print one feature event for every token read from *file*.

    Args:
        file: iterable of text lines, each either ``word tag`` or blank
            (blank lines are sequence boundaries).  ``\\n`` and ``\\r\\n``
            line endings are both accepted.

    Raises:
        ValueError: if a non-blank line does not consist of exactly two
            whitespace-separated fields.
    """
    # Start from a clean window so a second call does not inherit context
    # from an earlier one.
    window[:] = [('_B', 'E')]

    for lineno, line in enumerate(file, 1):
        fields = line.split()
        if not fields:
            word, tag = 'E_', 'E'
        elif len(fields) == 2:
            word, tag = fields
        else:
            raise ValueError(
                "line %d: expected 'word tag', got %r" % (lineno, line))
        if len(window) == 3:
            process_window()
        window.append((word, tag))

    # Flush: the entry in the middle of the final window has not been
    # emitted yet, and neither has the very last token when the corpus does
    # not end with a blank line.  Close the corpus with a boundary marker in
    # that case so the last token gets its event as well.
    if len(window) == 3:
        process_window()
    if window[-1][0] != 'E_':
        window.append(('E_', 'E'))
        if len(window) == 3:
            process_window()


def main(argv):
    """Run the extractor on the corpus named by ``argv[1]``.

    Returns the process exit status: 0 on success, 1 when no corpus path was
    given or the file could not be opened.
    """
    try:
        corpus = codecs.open(argv[1], "r", "utf-8")
    except (IndexError, IOError):
        # Report on stderr so the message never ends up in the feature file.
        sys.stderr.write("corpus file is not specified, or open failed!\n")
        return 1

    # Python 3.7+: force UTF-8 output regardless of the locale, matching the
    # explicit encoding done on Python 2.
    reconfigure = getattr(sys.stdout, 'reconfigure', None)
    if reconfigure is not None:
        reconfigure(encoding='utf-8')

    with corpus:
        process_file(corpus)
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))