package optimizer; /** * * @author Mojtaba Khallash * * This class generates the "validateFormat.py" and "SharedTaskCommon.py" file */ public class ValidationGenerator { public static String generateValidateFormat() { StringBuilder validate = new StringBuilder(); validate.append("#!/usr/local/bin/python\n\n"); validate.append("# (C)Copyright 2006, TOSHIBA Corporation, all rights reserved.\n\n"); validate.append("'''\n"); validate.append("Script for checking whether a file is in CoNLL-X shared task format\n"); validate.append("'''\n\n"); validate.append("# still to do:\n"); validate.append("# check that either all or none of PHEAD and PDEPREL are dummy value\n"); validate.append("### would not work for Spanish data...\n"); validate.append("# check that PHEAD is really projective\n\n\n"); validate.append("__author__ = 'Sabine Buchholz <sabine dot buchholz at crl dot toshiba dot co dot uk>'\n"); validate.append("__version__ = '$Id: validateFormat.py,v 1.4 2006/03/08 17:30:50 sabine Exp $'\n\n"); validate.append("import sys\n"); validate.append("import string\n"); validate.append("import os.path\n"); validate.append("import codecs, re\n\n"); validate.append("try:\n"); validate.append(" import optparse\n"); validate.append("except ImportError:\n"); validate.append(" print >>sys.stderr, \\\n"); validate.append(" \"You need Python version 2.3 or later; your version is %s\" % \\\n"); validate.append(" sys.version\n"); validate.append(" raise\n\n"); validate.append("from SharedTaskCommon import \\\n"); validate.append(" emptyProjColumnString, handleProblem, \\\n"); validate.append(" checkCycles_tmp2, checkCyclesPhead, Terminal\n\n"); validate.append("def validate(infile,instream,outstream, options):\n"); validate.append(" global exit_status\n\n"); validate.append(" # initialize\n"); validate.append(" line_number = 0\n"); validate.append(" (idmax, headmax, pheadmax) = (0, 0, 0)\n"); validate.append(" token_list = ['dummy'] # the 0'th element\n"); validate.append(" rootLines = []\n"); validate.append(" error_flag = 0\n"); validate.append(" sent_start = 1\n\n"); validate.append(" for line in instream:\n"); validate.append(" line_number += 1\n"); validate.append(" # empty line ends sentence\n"); validate.append(" if re.compile(u'^\\s*$').search(line):\n"); validate.append(" check_sentence(infile, options,\n"); validate.append(" sent_start,\n"); validate.append(" rootLines, token_list,\n"); validate.append(" error_flag,\n"); validate.append(" idmax, headmax, pheadmax)\n"); validate.append(" # re-initialize\n"); validate.append(" (idmax, headmax, pheadmax) = (0, 0, 0)\n"); validate.append(" token_list = ['dummy'] # the 0'th element\n"); validate.append(" rootLines = []\n"); validate.append(" error_flag = 0\n"); validate.append(" sent_start = line_number+1 # line where next sentence starts\n\n"); validate.append(" # non-empty line, i.e. token\n"); validate.append(" else:\n"); validate.append(" if options.input_sep == ' +': # if separator is spaces\n"); validate.append(" line = line.strip() # remove leading and trailing whitespace\n"); validate.append(" else:\n"); validate.append(" line = line.rstrip() # remove trailing whitespace (e.g. \\r, \\n)\n"); validate.append(" # split using input_sep regular expression\n"); validate.append(" fields = re.compile(options.input_sep).split(line)\n\n"); validate.append(" if len(fields) < minNumCols:\n"); validate.append(" msg = \"%s: Error: line %d: Too few columns (%d<%d):\\n\\t%s\" % \\\n"); validate.append(" (infile,line_number,len(fields),minNumCols,line)\n"); validate.append(" print >>sys.stderr, msg.encode(options.encoding)\n"); validate.append(" error_flag = 1\n"); validate.append(" exit_status = 1\n"); validate.append(" terminal = 'dummy'\n"); validate.append(" elif len(fields) > maxNumCols:\n"); validate.append(" msg = \"%s: Error: line %d: Too many columns (%d>%d):\\n\\t%s\" % \\\n"); validate.append(" (infile,line_number,len(fields),maxNumCols,line)\n"); validate.append(" print >>sys.stderr, msg.encode(options.encoding)\n"); validate.append(" error_flag = 1\n"); validate.append(" exit_status = 1\n"); validate.append(" terminal = 'dummy'\n"); validate.append(" else:\n"); validate.append(" (terminal, error_flag, idmax,\n"); validate.append(" headmax, pheadmax) = check_line(infile,line_number,line,\n"); validate.append(" options, rootLines,\n"); validate.append(" fields,token_list,\n"); validate.append(" error_flag,\n"); validate.append(" idmax,headmax,pheadmax)\n\n"); validate.append(" token_list.append(terminal)\n\n"); validate.append(" if len(token_list) > 1: # more than just dummy\n"); validate.append(" # i.e. some unprocessed sentence is left\n"); validate.append(" msg = \"line %d: No empty line after last sentence\" % \\\n"); validate.append(" (line_number)\n"); validate.append(" handleProblem(infile, 'whitespace', msg, options)\n\n"); validate.append(" check_sentence(infile, options,\n"); validate.append(" sent_start,\n"); validate.append(" rootLines, token_list,\n"); validate.append(" error_flag,\n"); validate.append(" idmax, headmax, pheadmax)\n\n\n"); validate.append("def check_sentence(infile, options,\n"); validate.append(" sent_start,\n"); validate.append(" rootLines, token_list,\n"); validate.append(" error_flag,\n"); validate.append(" idmax, headmax, pheadmax):\n"); validate.append(" global exit_status\n\n"); validate.append(" # check that there are tokens, i.e. not two\n"); validate.append(" # empty lines following each other\n"); validate.append(" if len(token_list) == 1: # just dummy\n"); validate.append(" msg = \"line %d: More than one empty line separating sentences\" % \\\n"); validate.append(" (sent_start)\n"); validate.append(" handleProblem(infile, 'whitespace', msg, options)\n\n"); validate.append(" else:\n"); validate.append(" if options.datatype == 'train' or options.datatype == 'system':\n"); validate.append(" if error_flag == 0: # only check if no error occurred so far\n"); validate.append(" # check that there is at least one root\n"); validate.append(" if len(rootLines) == 0:\n"); validate.append(" msg = \"%s: Error: line %dff: no token has HEAD=0\" % \\\n"); validate.append(" (infile, sent_start)\n"); validate.append(" print >>sys.stderr, msg.encode(options.encoding)\n"); validate.append(" error_flag = 1\n"); validate.append(" if options.datatype == 'train':\n"); validate.append(" exit_status = 1\n"); validate.append(" # else: system submissions: print but accept\n\n"); validate.append(" ## # check that there is exactly one root (option???)\n"); validate.append(" ## if len(rootLines) > 1:\n"); validate.append(" ## msg = \"%s line %dff: Warning: several tokens have HEAD=0:\\n\\t%s\" % \\\n"); validate.append(" ## (infile, sent_start,\"\\n\\t\".join(rootLines))\n"); validate.append(" ## print >>sys.stderr, msg.encode(options.encoding)\n\n"); validate.append(" # check that HEAD and PHEAD are not higher than highest ID\n"); validate.append(" if headmax > idmax:\n"); validate.append(" msg = \"%s: Error: line %dff: too big HEAD value (%d>%d)\" % \\\n"); validate.append(" (infile, sent_start, headmax, idmax)\n"); validate.append(" print >>sys.stderr, msg.encode(options.encoding)\n"); validate.append(" error_flag = 1\n"); validate.append(" if options.datatype == 'train':\n"); validate.append(" exit_status = 1\n"); validate.append(" # else: system submissions: print but accept\n"); validate.append(" if pheadmax > idmax:\n"); validate.append(" msg = \"%s: Error: line %dff: too big PHEAD value (%d>%d)\" % \\\n"); validate.append(" (infile, sent_start, pheadmax, idmax)\n"); validate.append(" print >>sys.stderr, msg.encode(options.encoding)\n"); validate.append(" error_flag = 1\n"); validate.append(" if options.datatype == 'train':\n"); validate.append(" exit_status = 1\n"); validate.append(" # else: system submissions: print but accept\n\n"); validate.append(" # if necessary, do punctuation checks\n"); validate.append(" if error_flag == 0: # only check if no error occurred so far\n"); validate.append(" if options.punctPostag != '':\n"); validate.append(" # a value is given, so punctuation must be checked\n"); validate.append(" punctRe = re.compile('^'+options.punctPostag+'$')\n"); validate.append(" non_punct_count = 0 # how many tokens are not punctuation\n"); validate.append(" for i in range(1,len(token_list)):\n"); validate.append(" if not punctRe.search(token_list[i].cpostag):\n"); validate.append(" # is not punctuation\n"); validate.append(" non_punct_count += 1\n"); validate.append(" headID = token_list[i].head\n"); validate.append(" if headID != 0: # not root (cannot link to punctuation anyway)\n"); validate.append(" if punctRe.search(token_list[headID].cpostag):\n"); validate.append(" # links to punctuation\n"); validate.append(" msg = \"line %dff: token %d (%s) links to punctuation\" % \\\n"); validate.append(" (sent_start, i, token_list[i].form)\n"); validate.append(" handleProblem(infile, 'punct', msg, options)\n"); validate.append(" # this assumes that punctuation linking\n"); validate.append(" # to punctuation is fine\n"); validate.append(" if non_punct_count == 0:\n"); validate.append(" msg = \"line %dff: only punctuation tokens in sentence\" % \\\n"); validate.append(" (sent_start)\n"); validate.append(" handleProblem(infile, 'punct', msg, options)\n\n\n"); validate.append(" # check for dependency cycles\n"); validate.append(" if options.datatype == 'train' or options.datatype == 'system':\n"); validate.append(" if error_flag == 0: # only check if no error occurred so far\n"); validate.append(" checkCycles_tmp2(\"%s line %dff\" % (infile, sent_start),\n"); validate.append(" options, token_list, options.rootDeprel)\n"); validate.append(" checkCyclesPhead(\"%s line %dff\" % (infile, sent_start),\n"); validate.append(" options, token_list, options.rootDeprel)\n\n\n"); validate.append("def check_line(infile,line_number,line,\n"); validate.append(" options, rootLines,\n"); validate.append(" fields,token_list,\n"); validate.append(" error_flag,\n"); validate.append(" idmax,headmax,pheadmax):\n"); validate.append(" global exit_status\n\n"); validate.append(" if options.datatype == 'train':\n"); validate.append(" (id, form, lemma, cpostag, postag,\n"); validate.append(" feats, head, deprel, phead, pdeprel) = fields\n"); validate.append(" elif options.datatype == 'test_blind':\n"); validate.append(" (id, form, lemma, cpostag, postag, feats) = fields\n"); validate.append(" (head, deprel, phead, pdeprel) = (u'0',emptyProjColumnString,\n"); validate.append(" emptyProjColumnString,emptyProjColumnString)\n"); validate.append(" elif options.datatype == 'system':\n"); validate.append(" (id, form, lemma, cpostag, postag,\n"); validate.append(" feats, head, deprel) = fields[0:8]\n"); validate.append(" if len(fields) == 8:\n"); validate.append(" (phead, pdeprel) = (emptyProjColumnString,emptyProjColumnString)\n"); validate.append(" elif len(fields) == 9:\n"); validate.append(" phead = fields[8]\n"); validate.append(" elif len(fields) == 10:\n"); validate.append(" (phead, pdeprel) = fields[8:10]\n\n"); validate.append(" # check that ID is integer > 0\n"); validate.append(" if id != u'0' and re.compile(u'^[0-9]+$').search(id):\n"); validate.append(" id = int(id)\n"); validate.append(" if id > idmax:\n"); validate.append(" idmax = id\n"); validate.append(" # check that ID is consecutive\n"); validate.append(" if id != len(token_list):\n"); validate.append(" msg = \"%s: Error: line %d: Non-consecutive value for ID column (%d!=%d):\\n\\t%s\" % \\\n"); validate.append(" (infile,line_number,id,len(token_list),line)\n"); validate.append(" print >>sys.stderr, msg.encode(options.encoding)\n"); validate.append(" error_flag = 1\n"); validate.append(" exit_status = 1\n"); validate.append(" else:\n"); validate.append(" msg = \"%s: Error: line %d: Illegal value for ID column:\\n\\t%s\" % \\\n"); validate.append(" (infile,line_number,line)\n"); validate.append(" print >>sys.stderr, msg.encode(options.encoding)\n"); validate.append(" error_flag = 1\n"); validate.append(" exit_status = 1\n\n"); validate.append(" if options.datatype == 'train' or options.datatype == 'system':\n"); validate.append(" # check that PHEAD is emptyProjColumnString or integer >= 0\n"); validate.append(" if phead != emptyProjColumnString:\n"); validate.append(" if re.compile(u'^[0-9]+$').search(phead):\n"); validate.append(" phead = int(phead)\n"); validate.append(" if phead > pheadmax:\n"); validate.append(" pheadmax = phead\n"); validate.append(" else:\n"); validate.append(" msg = \"%s: Error: line %d: Illegal value for PHEAD column:\\n\\t%s\" % \\\n"); validate.append(" (infile,line_number,line)\n"); validate.append(" print >>sys.stderr, msg.encode(options.encoding)\n"); validate.append(" error_flag = 1\n"); validate.append(" if options.datatype == 'train':\n"); validate.append(" exit_status = 1\n"); validate.append(" # else: system submissions: print but accept\n\n"); validate.append(" # check that HEAD is integer >= 0\n"); validate.append(" if re.compile(u'^[0-9]+$').search(head):\n"); validate.append(" head = int(head)\n"); validate.append(" if head > headmax:\n"); validate.append(" headmax = head\n"); validate.append(" if options.rootDeprel != '' and deprel == options.rootDeprel and head != 0:\n"); validate.append(" # check that HEAD is 0 if DEPREL is options.rootDeprel\n"); validate.append(" msg = (\"line %d: HEAD is not 0:\\n\\t%s\" % \\\n"); validate.append(" (line_number,line))\n"); validate.append(" handleProblem(infile, 'root', msg, options)\n"); validate.append(" if head == 0: # root\n"); validate.append(" # check that DEPREL is options.rootDeprel if HEAD is 0\n"); validate.append(" if options.rootDeprel != '' and deprel != options.rootDeprel:\n"); validate.append(" msg = \"line %d: DEPREL is not %s:\\n\\t%s\" % \\\n"); validate.append(" (line_number,options.rootDeprel,line)\n"); validate.append(" handleProblem(infile, 'root', msg, options)\n"); validate.append(" rootLines.append(line)\n\n"); validate.append(" else:\n"); validate.append(" msg = \"%s: Error: line %d: Illegal value for HEAD column:\\n\\t%s\" % \\\n"); validate.append(" (infile,line_number,line)\n"); validate.append(" print >>sys.stderr, msg.encode(options.encoding)\n"); validate.append(" error_flag = 1\n"); validate.append(" if options.datatype == 'train':\n"); validate.append(" exit_status = 1\n"); validate.append(" # else: system submissions: print but accept\n\n"); validate.append(" # check that other fields are not empty\n"); validate.append(" # (can occur with tab but not with spaces as separator)\n"); validate.append(" if (len(form) == 0 or len(lemma) == 0 or\n"); validate.append(" len(cpostag) == 0 or len(postag) == 0 or\n"); validate.append(" len(feats) == 0 or len(deprel) == 0 or len(pdeprel) == 0):\n"); validate.append(" msg = \"line %d: At least one column value is the empty string:\\n\\t%s\" % \\\n"); validate.append(" (line_number,line)\n"); validate.append(" handleProblem(infile, 'other', msg, options)\n\n"); validate.append(" # check that other fields do not contain whitespace\n"); validate.append(" ws = re.compile('\\s')\n"); validate.append(" if (ws.search(form) or ws.search(lemma) or\n"); validate.append(" ws.search(cpostag) or ws.search(postag) or\n"); validate.append(" ws.search(feats) or ws.search(deprel) or ws.search(pdeprel)):\n"); validate.append(" msg = \"line %d: At least one column value contains whitespace:\\n\\t%s\" % \\\n"); validate.append(" (line_number,line)\n"); validate.append(" handleProblem(infile, 'whitespace', msg, options)\n\n\n"); validate.append(" terminal = Terminal(id,form,lemma,cpostag,postag,feats,deprel, phead,pdeprel)\n"); validate.append(" terminal.head = head # change class???\n"); validate.append(" return (terminal, error_flag, idmax, headmax, pheadmax)\n\n\n\n"); validate.append("usage = \\\n"); validate.append("\"\"\"\n"); validate.append(" %prog [options] FILES\n\n"); validate.append("purpose:\n"); validate.append(" checks whether files are in CoNLL-X shared task format\n\n"); validate.append("args:\n"); validate.append(" FILES input files\n"); validate.append("\"\"\"\n\n"); validate.append("parser = optparse.OptionParser(usage, version=__version__)\n\n"); validate.append("problems_types = [ 'cycle', # cycles in dependency structure\n"); validate.append(" #'index', # index of head points to non-existent token\n"); validate.append(" 'punct', # problem with punctuation\n"); validate.append(" 'whitespace', # missing or superfluous whitespace\n"); validate.append(" 'root', # problem with token linking to root\n"); validate.append(" 'other' # anything else\n"); validate.append(" ]\n\n"); validate.append("parser.add_option('-d', '--discard_problems',\n"); validate.append(" dest='discard_problems',\n"); validate.append(" metavar='STRING',\n"); validate.append(" choices=problems_types,\n"); validate.append(" action='append',\n"); validate.append(" default = []\n"); validate.append(" ) # needed only for checkCycles functions\n\n"); validate.append("parser.add_option('-e', '--encoding',\n"); validate.append(" dest='encoding',\n"); validate.append(" metavar='STRING',\n"); validate.append(" action='store',\n"); validate.append(" default='utf-8',\n"); validate.append(" help=\"output character encoding (default is utf-8)\")\n\n"); validate.append("parser.add_option('-i', '--input_separator',\n"); validate.append(" dest='input_sep',\n"); validate.append(" metavar='STRING',\n"); validate.append(" action='store',\n"); validate.append(" default='\\t', # tab ### default=' +' # spaces\n"); validate.append(" help=\"\"\"regular expression for column separator in\n"); validate.append(" input (default is one tab, i.e. '\\\\t')\"\"\")\n\n"); validate.append("parser.add_option('-p', '--punctuation',\n"); validate.append(" dest='punctPostag',\n"); validate.append(" metavar='STRING',\n"); validate.append(" action='store',\n"); validate.append(" default='',\n"); validate.append(" help=\"\"\"use given regular expression to identify\n"); validate.append(" punctuation (by matching with the CPOSTAG column)\n"); validate.append(" and check that nothing links to and that a sentence\n"); validate.append(" contains more than just punctuation (default: turned off)\"\"\")\n\n"); validate.append("parser.add_option('-r', '--root_deprel',\n"); validate.append(" dest='rootDeprel',\n"); validate.append(" metavar='STRING',\n"); validate.append(" action='store',\n"); validate.append(" default='', # no root specified\n"); validate.append(" help=\"\"\"designated root label: check that there is exactly\n"); validate.append(" one token with that label and that it's HEAD is 0\n"); validate.append(" (default: not specified)\"\"\")\n\n"); validate.append("parser.add_option('-s', '--silence_warnings',\n"); validate.append(" dest='silence_warnings',\n"); validate.append(" metavar='STRING',\n"); validate.append(" choices=problems_types,\n"); validate.append(" action='append',\n"); validate.append(" default = [],\n"); validate.append(" help=\"\"\"don't warn about certain types of\n"); validate.append(" problems (default is to warn about every problem);\n"); validate.append(" possible choices:\"\"\"+' '.join(problems_types)\n"); validate.append(" )\n\n"); validate.append("parser.add_option('-t', '--type',\n"); validate.append(" dest='datatype',\n"); validate.append(" metavar='STRING',\n"); validate.append(" action='store',\n"); validate.append(" default='train', # training data\n"); validate.append(" help=\"\"\"type of the data to be tested: train,\n"); validate.append(" test_blind, system (default: train)\"\"\")\n\n"); validate.append("(options, args) = parser.parse_args()\n\n"); validate.append("# to enable random access to values\n"); validate.append("options.discard_problems = dict.fromkeys(options.discard_problems)\n"); validate.append("options.silence_warnings = dict.fromkeys(options.silence_warnings)\n\n"); validate.append("# how many columns there should be\n"); validate.append("minNumCols = 10 # default # for 'train'\n"); validate.append("maxNumCols = 10 # default # for 'train'\n"); validate.append("if options.datatype == 'test_blind':\n"); validate.append(" minNumCols = 6\n"); validate.append(" maxNumCols = 6\n"); validate.append("elif options.datatype == 'system':\n"); validate.append(" minNumCols = 8\n"); validate.append(" maxNumCols = 10\n"); validate.append("elif options.datatype != 'train':\n"); validate.append(" print >>sys.stderr, 'Incorrect value for option -t: must be train, test_blind or system'\n"); validate.append(" sys.exit(1)\n\n"); validate.append("exit_status = 0\n\n"); validate.append("if not args:\n"); validate.append(" print >>sys.stderr, 'Incorrect number of arguments'\n"); validate.append(" sys.exit(1)\n"); validate.append("else:\n"); validate.append(" for infile in args:\n"); validate.append(" validate(infile,\n"); validate.append(" codecs.open(infile, 'r', 'utf-8'), # encoding option\n"); validate.append(" sys.stdout,\n"); validate.append(" options)\n\n"); validate.append("print >>sys.stderr, 'Exit status = ',exit_status\n"); validate.append("sys.exit(exit_status)\n"); return validate.toString(); } public static String generateSharedTaskCommon() { StringBuilder task = new StringBuilder(); task.append("#!/usr/local/bin/python\n\n"); task.append("import sys\n"); task.append("import string\n\n"); task.append("rootDeprel = u'ROOT' # the dependency relation for the root\n"); task.append("emptyFeatsString = u'_' # if no morphological features exist (only PoS)\n"); task.append("featsJoiner = u'|' # to join morphological features into one string\n"); task.append("emptyProjColumnString = u'_' # if no PHEAD or PDEPREL available\n\n"); task.append("class NonTerminal:\n"); task.append(" def __init__(self,constLabel,features,deprel):\n"); task.append(" self.constLabel = constLabel\n"); task.append(" self.features = features\n"); task.append(" self.head = {} # a dictionary of references to the lexical heads\n"); task.append(" self.deprel = deprel\n"); task.append(" self.children = []\n\n"); task.append(" def getLexHead(self,head_type):\n"); task.append(" if not self.head.has_key(head_type): # does not have this head type\n"); task.append(" # this can happen if a proper head child could not be found\n"); task.append(" # according to the normal head rules and the default rules\n"); task.append(" # have been applied, resulting e.g. in an NP being the\n"); task.append(" # head of a finite clause\n"); task.append(" head_type = 'head' # take default head type\n"); task.append(" return self.head[head_type]\n\n\n"); task.append("class Terminal:\n"); task.append(" def __init__(self, id, form, lemma, cpostag, postag, feats, deprel,\n"); task.append(" phead = emptyProjColumnString, pdeprel = emptyProjColumnString):\n"); task.append(" self.id = id\n"); task.append(" self.form = form\n"); task.append(" self.lemma = lemma\n"); task.append(" self.cpostag = cpostag\n"); task.append(" self.postag = postag\n"); task.append(" self.feats = feats\n"); task.append(" self.deprel = deprel\n"); task.append(" self.phead = phead\n"); task.append(" self.pdeprel = pdeprel\n"); task.append(" # initially, a terminal links to itself;\n"); task.append(" # needed for recursive percolation of lexical heads\n"); task.append(" self.head = self\n\n"); task.append(" def getLexHead(self,head_type):\n"); task.append(" # the head_type is irrelevant:\n"); task.append(" # terminals only have one head\n"); task.append(" return self.head\n\n"); task.append("class CorpusError(Exception):\n"); task.append(" def __init__(self, value):\n"); task.append(" self.value = value\n"); task.append(" def __str__(self):\n"); task.append(" return `self.value`\n\n\n"); task.append("def processOptionsBlanks(options):\n"); task.append(" \"\"\"turn string of column widths (e.g. '3|10|10|5')\n"); task.append(" into list (e.g. [3,10,10,5])\"\"\"\n\n"); task.append(" if options.blanks:\n"); task.append(" list = options.blanks.split('|')\n"); task.append(" if len(list) != len(options.output):\n"); task.append(" print >>sys.stderr, (\"Value to blanks option does not \\\n"); task.append("have same number of elements as output columns chosen:\\n\\\n"); task.append("%s != %s\" % (list,options.output))\n"); task.append(" sys.exit(-1)\n"); task.append(" for i in range(len(list)):\n"); task.append(" try:\n"); task.append(" int = string.atoi(list[i])\n"); task.append(" except ValueError:\n"); task.append(" print >>sys.stderr, (\"Non-integer value in blanks option: %s\" %\n"); task.append(" list[i])\n"); task.append(" sys.exit(-1)\n"); task.append(" else:\n"); task.append(" list[i] = int\n"); task.append(" options.blanks = list\n\n\n"); task.append("# obsolete: just use '... = dict.fromkeys(list)' instead\n"); task.append("# thanks to EM for ponting this out\n"); task.append("#def turnListIntoHash(list):\n"); task.append("# hash = {}\n"); task.append("# for i in range(len(list)):\n"); task.append("# hash[list[i]] = 1\n"); task.append("# return hash\n\n\n"); task.append("def handleProblem(infile, problem_type, msg, options):\n"); task.append(" \"depending on options: raise exception or just warn or stay silent\"\n"); task.append(" if options.discard_problems.has_key(problem_type):\n"); task.append(" raise CorpusError, msg\n"); task.append(" else:\n"); task.append(" if not options.silence_warnings.has_key(problem_type):\n"); task.append(" print >>sys.stderr, (\"%s: Warning: %s\" %\n"); task.append(" (infile, msg)).encode(options.encoding)\n\n\n"); task.append("def addOptions(parser):\n"); task.append(" # what types of problems can occur during conversion;\n"); task.append(" # list can be used to selectively silence warnings\n"); task.append(" # or discard sentences (or files, with 'XML') that\n"); task.append(" # have those problems\n"); task.append(" problems_types = [ 'XML', # error in XML parsing\n"); task.append(" 'cycle', # cycles in dependency structure\n"); task.append(" 'label', # wrong POS/constituent/function label\n"); task.append(" 'index', # index of head points to non-existent token\n"); task.append(" 'punct', # problem in reattaching children of punctuation\n"); task.append(" 'whitespace', # missing or superfluous whitespace\n"); task.append(" 'discontinuity', # problem relating to annotation of discontinuity\n"); task.append(" 'ambiguity', # ambiguous structure annotated in treebank\n"); task.append(" 'tree', # problem with structure of tree (not discontinuity)\n"); task.append(" 'head_table', # cannot find head child\n"); task.append(" 'other' # anything else\n"); task.append(" ]\n\n"); task.append(" parser.add_option('-b', '--blanks',\n"); task.append(" dest='blanks',\n"); task.append(" action='store',\n"); task.append(" metavar='FORMAT',\n"); task.append(" default='',\n"); task.append(" help=\"\"\"\n"); task.append(" use variable number of blanks as\n"); task.append(" output column separator (default is tab);\n"); task.append(" expects argument FORMAT of form: i|j|k|...\n"); task.append(" where i,j,k etc. are integer>0, indicating the minimum\n"); task.append(" width of that column (there must be as many integers as\n"); task.append(" columns requested in the output)\n"); task.append(" \"\"\"\n"); task.append(" )\n\n"); task.append(" parser.add_option('-c', '--condition',\n"); task.append(" dest='condition',\n"); task.append(" action='store',\n"); task.append(" metavar='CONDITION',\n"); task.append(" default='',\n"); task.append(" help=\"\"\"use only those files/extracts/sentences that\n"); task.append(" fulfill CONDITION (e.g. <743 or >=743); useful for\n"); task.append(" splitting into training and test set\"\"\"\n"); task.append(" )\n\n"); task.append(" parser.add_option('-d', '--discard_problems',\n"); task.append(" dest='discard_problems',\n"); task.append(" choices=problems_types,\n"); task.append(" action='append',\n"); task.append(" default = [],\n"); task.append(" help=\"\"\"discard sentence (or file, for XML problems) that\n"); task.append(" exhibits certain problems (default is fix, not discard);\n"); task.append(" possible choices:\"\"\"+' '.join(problems_types)\n"); task.append(" )\n\n"); task.append(" parser.add_option('-e', '--encoding',\n"); task.append(" dest='encoding',\n"); task.append(" action='store',\n"); task.append(" default='utf-8',\n"); task.append(" help=\"output character encoding (default is utf-8)\")\n\n"); task.append(" parser.add_option('-f', '--file',\n"); task.append(" dest='file',\n"); task.append(" action='store_true',\n"); task.append(" default=False,\n"); task.append(" help=\"\"\"write output to file, replacing original\n"); task.append(" suffix by .conll (default is to standard output)\"\"\"\n"); task.append(" )\n\n"); task.append(" parser.add_option('-o', '--output',\n"); task.append(" dest='output',\n"); task.append(" choices=['id','form','lemma','cpostag','postag',\n"); task.append(" 'feats','head','deprel','phead','pdeprel'],\n"); task.append(" action='append',\n"); task.append(" default = [],\n"); task.append(" help=\"\"\"print named column in output, in order\n"); task.append(" specified on command line(default is none);\n"); task.append(" possible choices:\n"); task.append(" 'id','form','lemma','cpostag','postag',\n"); task.append(" 'feats','head','deprel','phead','pdeprel'\"\"\"\n"); task.append(" )\n\n"); task.append(" parser.add_option('-s', '--silence_warnings',\n"); task.append(" dest='silence_warnings',\n"); task.append(" choices=problems_types,\n"); task.append(" action='append',\n"); task.append(" default = [],\n"); task.append(" help=\"\"\"don't warn about certain types of conversion\n"); task.append(" problems (default is to warn about every problem);\n"); task.append(" possible choices:\"\"\"+' '.join(problems_types)\n"); task.append(" )\n\n"); task.append(" parser.add_option('-p', '--punctuation',\n"); task.append(" dest='punctuation',\n"); task.append(" action='store_true',\n"); task.append(" default=False,\n"); task.append(" help='links words linking to punctuation to punctuation\\'s head instead'\n"); task.append(" )\n\n\n\n"); task.append("def checkCycles(infile, options, token_list, rootFunction):\n"); task.append(" for i in range(1,len(token_list)):\n"); task.append(" head_path = { i: 1 }\n"); task.append(" j = i\n"); task.append(" while j != 0:\n"); task.append(" j = token_list[ j ]['head']\n"); task.append(" if head_path.has_key(j): # cycle found!\n"); task.append(" # raise exception or just warn or stay silent\n"); task.append(" msg = (u\"Cycle detected at token %d (%s)\" %\n"); task.append(" (j, token_list[ j ]['form']))\n"); task.append(" handleProblem(infile, 'cycle', msg, options)\n"); task.append(" # break cycle by linking token to root\n"); task.append(" token_list[ j ]['head'] = 0\n"); task.append(" token_list[ j ]['deprel'] = rootFunction\n"); task.append(" break\n"); task.append(" else:\n"); task.append(" head_path[j] = 1\n\n\n"); task.append("def checkCycles_tmp2(infile, options, token_list, rootFunction):\n"); task.append(" for i in range(1,len(token_list)):\n"); task.append(" head_path = { i: 1 }\n"); task.append(" j = i\n"); task.append(" while j != 0:\n"); task.append(" j = token_list[ j ].head\n"); task.append(" if head_path.has_key(j): # cycle found!\n"); task.append(" # raise exception or just warn or stay silent\n"); task.append(" msg = (u\"Cycle detected at token %d (%s)\" %\n"); task.append(" (j, token_list[ j ].form))\n"); task.append(" handleProblem(infile, 'cycle', msg, options)\n"); task.append(" # break cycle by linking token to root\n"); task.append(" token_list[ j ].head = 0\n"); task.append(" token_list[ j ].deprel = rootFunction\n"); task.append(" break\n"); task.append(" else:\n"); task.append(" head_path[j] = 1\n\n"); task.append("def checkCyclesPhead(infile, options, token_list, rootFunction):\n"); task.append(" for i in range(1,len(token_list)):\n"); task.append(" head_path = { i: 1 }\n"); task.append(" j = i\n"); task.append(" while j != 0 and token_list[ j ].phead != emptyProjColumnString:\n"); task.append(" # if PHEAD column contains dummy value, just stop checking\n\n"); task.append(" j = token_list[ j ].phead\n"); task.append(" if head_path.has_key(j): # cycle found!\n"); task.append(" # raise exception or just warn or stay silent\n"); task.append(" msg = (u\"PHEAD cycle detected at token %d (%s)\" %\n"); task.append(" (j, token_list[ j ].form))\n"); task.append(" handleProblem(infile, 'cycle', msg, options)\n"); task.append(" # break cycle by linking token to root\n"); task.append(" token_list[ j ].phead = 0\n"); task.append(" token_list[ j ].pdeprel = rootFunction\n"); task.append(" break\n"); task.append(" else:\n"); task.append(" head_path[j] = 1\n\n\n"); task.append("def attachPunctHigh(infile, options, token_list, punctuationPos,\n"); task.append(" punctuationFunction, rootFunction):\n"); task.append(" \"\"\"\n"); task.append(" Reattach punctuation as high as possible,\n"); task.append(" change deprel to value punctuationFunction.\n"); task.append(" \"\"\"\n\n"); task.append(" for i in range(1,len(token_list)):\n"); task.append(" token1 = token_list[ i ]\n"); task.append(" if token1['postag'] == punctuationPos:\n"); task.append(" punc = token1\n\n"); task.append(" # find highest attachment point\n"); task.append(" highest = 0\n"); task.append(" head_path = {}\n"); task.append(" if i>1:\n"); task.append(" j=i-1\n"); task.append(" while token_list[ j ]['head'] != 0:\n"); task.append(" if head_path.has_key(j):\n"); task.append(" # raise exception or just warn or stay silent\n"); task.append(" msg = (u\"Cycle detected at token %d (%s)\" %\n"); task.append(" (j, token_list[ j ]['form']))\n"); task.append(" handleProblem(infile, 'cycle', msg, options)\n"); task.append(" # break cycle by linking token to root\n"); task.append(" token_list[ j ]['head'] = 0\n"); task.append(" token_list[ j ]['deprel'] = rootFunction\n"); task.append(" break\n"); task.append(" head_path[j] = 1\n"); task.append(" j = token_list[ j ]['head']\n"); task.append(" highest = j\n"); task.append(" if i<len(token_list)-1:\n"); task.append(" j=i+1\n"); task.append(" while token_list[ j ]['head'] != 0:\n"); task.append(" if head_path.has_key(j):\n"); task.append(" if head_path[j] == 2:\n"); task.append(" # raise exception or just warn or stay silent\n"); task.append(" msg = (u\"Cycle detected at token %d (%s)\" %\n"); task.append(" (j, token_list[ j ]['form']))\n"); task.append(" handleProblem(infile, 'cycle', msg, options)\n"); task.append(" # break cycle by linking token to root\n"); task.append(" token_list[ j ]['head'] = 0\n"); task.append(" token_list[ j ]['deprel'] = rootFunction\n"); task.append(" break\n"); task.append(" elif head_path[j] == 1:\n"); task.append(" # was also on other path\n"); task.append(" break\n"); task.append(" head_path[j] = 2\n"); task.append(" j=token_list[ j ]['head']\n"); task.append(" highest = j\n\n"); task.append(" # make punctuation link to highest\n"); task.append(" punc['head'] = highest\n"); task.append(" if highest == 0:\n"); task.append(" punc['deprel'] = rootFunction\n"); task.append(" else:\n"); task.append(" punc['deprel'] = punctuationFunction\n\n"); task.append(" return token_list\n\n\n"); task.append("def printSentences(sent_list, options, outstream):\n"); task.append(" \"\"\"\n"); task.append(" print all sentences in sent_list;\n"); task.append(" tokens are dictionaries\n"); task.append(" \"\"\"\n"); task.append(" # ??? should format string be unicode string regardless of options.encoding?\n\n"); task.append(" format = []\n"); task.append(" for j in range(len(options.output)): # for each column\n"); task.append(" if options.blanks:\n"); task.append(" width = options.blanks[j] # pad with blanks\n"); task.append(" if j < len(options.output)-1: # non-last column\n"); task.append(" format_string = u'%'+`width`+u's ' # e.g. u\"%-15s \"\n"); task.append(" else: # last column\n"); task.append(" format_string = u'%'+`width`+u's' # e.g. u\"%-15s\"\n"); task.append(" else: # separate by tab\n"); task.append(" if j < len(options.output)-1: # non-last column\n"); task.append(" format_string = u'%s\\t'\n"); task.append(" else: # last column\n"); task.append(" format_string = u'%s'\n"); task.append(" format.append(format_string)\n\n"); task.append(" for sent in sent_list: # for each sentence\n"); task.append(" word_count = 0\n"); task.append(" for i in range(1,len(sent)): # for each token\n"); task.append(" token = sent[i]\n"); task.append(" word_count += 1\n"); task.append(" for j in range(len(options.output)): # for each column\n"); task.append(" column_name = options.output[j]\n"); task.append(" if column_name == 'id':\n"); task.append(" output_string = format[j] % word_count\n"); task.append(" else:\n"); task.append(" value = token[column_name] # get value for column\n"); task.append(" if column_name == 'feats':\n"); task.append(" if value == []: # if no features:\n"); task.append(" value = emptyFeatsString # use default value\n"); task.append(" else:\n"); task.append(" value = featsJoiner.join(value) # else: join\n"); task.append(" output_string = format[j] % value # format string\n"); task.append(" outstream.write(output_string.encode(options.encoding)) # print\n"); task.append(" outstream.write(\"\\n\") # newline at end of token\n"); task.append(" outstream.write(\"\\n\") # extra newline at end of token\n\n\n"); task.append("def printSentences_tmp2(sent_list, options, outstream):\n"); task.append(" \"\"\"\n"); task.append(" print all sentences in sent_list;\n"); task.append(" tokens are class instances\n"); task.append(" \"\"\"\n"); task.append(" # ??? should format string be unicode string regardless of options.encoding?\n\n"); task.append(" format = []\n"); task.append(" for j in range(len(options.output)): # for each column\n"); task.append(" if options.blanks:\n"); task.append(" width = options.blanks[j] # pad with blanks\n"); task.append(" if j < len(options.output)-1: # non-last column\n"); task.append(" format_string = u'%'+`width`+u's ' # e.g. u\"%-15s \"\n"); task.append(" else: # last column\n"); task.append(" format_string = u'%'+`width`+u's' # e.g. u\"%-15s\"\n"); task.append(" else: # separate by tab\n"); task.append(" if j < len(options.output)-1: # non-last column\n"); task.append(" format_string = u'%s\\t'\n"); task.append(" else: # last column\n"); task.append(" format_string = u'%s'\n"); task.append(" format.append(format_string)\n\n"); task.append(" for sent in sent_list: # for each sentence\n"); task.append(" word_count = 0\n"); task.append(" for i in range(1,len(sent)): # for each token\n"); task.append(" token = sent[i]\n"); task.append(" word_count += 1\n"); task.append(" for j in range(len(options.output)): # for each column\n"); task.append(" column_name = options.output[j]\n"); task.append(" if column_name == 'id':\n"); task.append(" output_string = format[j] % word_count # format string\n"); task.append(" # ??? check that word count is same as ID?\n"); task.append(" else:\n"); task.append(" value = getattr(token,column_name) # get value for column\n"); task.append(" if column_name == 'feats':\n"); task.append(" if value == []: # if no features:\n"); task.append(" value = emptyFeatsString # use default value\n"); task.append(" else:\n"); task.append(" value = featsJoiner.join(value) # else: join\n"); task.append(" output_string = format[j] % value # format string\n"); task.append(" outstream.write(output_string.encode(options.encoding)) # print\n"); task.append(" outstream.write(\"\\n\") # newline at end of token\n"); task.append(" outstream.write(\"\\n\") # extra newline at end of token\n"); return task.toString(); } public static String generateEval07() { StringBuilder task = new StringBuilder(); task.append("#!/usr/bin/env perl\n"); task.append("# $Id: eval07.pl,v 1.4 2007/04/02 21:17:34 dyuret Exp $\n\n"); task.append("# Author: Yuval Krymolowski\n"); task.append("# Addition of precision and recall \n"); task.append("# and of frame confusion list: Sabine Buchholz\n"); task.append("# Addition of DEPREL + ATTACHMENT:\n"); task.append("# Prokopis Prokopidis (prokopis at ilsp dot gr)\n"); task.append("# Acknowledgements: \n"); task.append("# to Markus Kuhn for suggesting the use of \n"); task.append("# the Unicode category property\n"); task.append("# Adaptation to CoNLL-07:\n"); task.append("# Deniz Yuret (denizyuret at gmail dot com)\n\n"); task.append("if ($] < 5.008001)\n"); task.append("{\n"); task.append(" printf STDERR <<EOM\n\n"); task.append(" This script requires PERL 5.8.1 for running.\n"); task.append(" The new version is needed for proper handling\n"); task.append(" of Unicode characters.\n\n"); task.append(" Please obtain a new version or contact the shared task team\n"); task.append(" if you are unable to upgrade PERL.\n\n"); task.append("EOM\n"); task.append(";\n"); task.append(" exit(1) ;\n"); task.append("}\n\n"); task.append("require Encode;\n\n"); task.append("use strict ;\n"); task.append("use warnings;\n"); task.append("use Getopt::Std ;\n\n"); task.append("my ($usage) = <<EOT\n\n"); task.append(" CoNLL-07 evaluation script:\n\n"); task.append(" [perl] eval.pl [OPTIONS] -g <gold standard> -s <system output>\n\n"); task.append(" This script evaluates a system output with respect to a gold standard.\n"); task.append(" Both files should be in UTF-8 encoded CoNLL-07 tabular format.\n\n"); task.append(" The output breaks down the errors according to their type and context.\n\n"); task.append(" Optional parameters:\n"); task.append(" -o FILE : output: print output to FILE (default is standard output)\n"); task.append(" -q : quiet: only print overall performance, without the details\n"); task.append(" -b : evalb: produce output in a format similar to evalb \n"); task.append(" (http://nlp.cs.nyu.edu/evalb/); use together with -q\n"); task.append(" -p : punctuation: do not score punctuation (default is to score)\n"); task.append(" -d : deriv: do not score on DERIV links (default is to score)\n"); task.append(" -v : version: show the version number\n"); task.append(" -h : help: print this help text and exit\n\n"); task.append("EOT\n"); task.append(";\n\n"); task.append("my ($line_num) ;\n"); task.append("my ($sep) = '0x01' ;\n\n"); task.append("my ($START) = '.S' ;\n"); task.append("my ($END) = '.E' ;\n\n"); task.append("my ($con_err_num) = 3 ;\n"); task.append("my ($freq_err_num) = 10 ;\n"); task.append("my ($spec_err_loc_con) = 8 ;\n\n"); task.append("our ($opt_g, $opt_s, $opt_o, $opt_h, $opt_v, $opt_q, $opt_p, $opt_b, $opt_d) ;\n"); task.append("my ($word_mismatch_warning);\n\n\n"); task.append("################################################################################\n"); task.append("### subfunctions ###\n"); task.append("################################################################################\n\n"); task.append("# Whether a string consists entirely of characters with the Unicode\n"); task.append("# category property \"Punctuation\" (see \"man perlunicode\")\n"); task.append("sub is_uni_punct\n"); task.append("{\n"); task.append(" my ($word) = @_ ;\n\n"); task.append(" return scalar(Encode::decode_utf8($word)=~ /^\\p{Punctuation}+$/) ;\n"); task.append("}\n\n"); task.append("# The length of a unicode string, excluding non-spacing marks\n"); task.append("# (for example vowel marks in Arabic)\n\n"); task.append("sub uni_len\n"); task.append("{\n"); task.append(" my ($word) = @_ ;\n"); task.append(" my ($ch, $l) ;\n\n"); task.append(" $l = 0 ;\n"); task.append(" foreach $ch (split(//, Encode::decode_utf8($word)))\n"); task.append(" {\n"); task.append(" if ($ch !~ /^\\p{NonspacingMark}/)\n"); task.append(" {\n"); task.append(" $l++ ;\n"); task.append(" }\n"); task.append(" }\n\n"); task.append(" return $l ;\n"); task.append("}\n\n"); task.append("sub filter_context_counts\n"); task.append("{ # filter_context_counts\n\n"); task.append(" my ($vec, $num, $max_len) = @_ ;\n"); task.append(" my ($con, $l, $thresh) ;\n\n"); task.append(" $thresh = (sort {$b <=> $a} values %{$vec})[$num-1] ;\n\n"); task.append(" foreach $con (keys %{$vec})\n"); task.append(" {\n"); task.append(" if (${$vec}{$con} < $thresh)\n"); task.append(" {\n"); task.append(" delete ${$vec}{$con} ;\n"); task.append(" next ;\n"); task.append(" }\n\n"); task.append(" $l = uni_len($con) ;\n\n"); task.append(" if ($l > ${$max_len})\n"); task.append(" {\n"); task.append(" ${$max_len} = $l ;\n"); task.append(" }\n"); task.append(" }\n"); task.append("\n"); task.append("} # filter_context_counts\n\n"); task.append("sub print_context\n"); task.append("{ # print_context\n\n"); task.append(" my ($counts, $counts_pos, $max_con_len, $max_con_pos_len) = @_ ;\n"); task.append(" my (@v_con, @v_con_pos, $con, $con_pos, $i, $n) ;\n\n"); task.append(" printf OUT \" %-*s | %-4s | %-4s | %-4s | %-4s\", $max_con_pos_len, 'CPOS', 'any', 'head', 'dep', 'both' ;\n"); task.append(" printf OUT \" ||\" ;\n"); task.append(" printf OUT \" %-*s | %-4s | %-4s | %-4s | %-4s\", $max_con_len, 'word', 'any', 'head', 'dep', 'both' ;\n"); task.append(" printf OUT \"\\n\" ;\n"); task.append(" printf OUT \" %s-+------+------+------+-----\", '-' x $max_con_pos_len;\n"); task.append(" printf OUT \"--++\" ;\n"); task.append(" printf OUT \"--%s-+------+------+------+-----\", '-' x $max_con_len;\n"); task.append(" printf OUT \"\\n\" ;\n\n"); task.append(" @v_con = sort {${$counts}{tot}{$b} <=> ${$counts}{tot}{$a}} keys %{${$counts}{tot}} ;\n"); task.append(" @v_con_pos = sort {${$counts_pos}{tot}{$b} <=> ${$counts_pos}{tot}{$a}} keys %{${$counts_pos}{tot}} ;\n\n"); task.append(" $n = scalar @v_con ;\n"); task.append(" if (scalar @v_con_pos > $n)\n"); task.append(" {\n"); task.append(" $n = scalar @v_con_pos ;\n"); task.append(" }\n\n"); task.append(" foreach $i (0 .. $n-1)\n"); task.append(" {\n"); task.append(" if (defined $v_con_pos[$i])\n"); task.append(" {\n"); task.append(" $con_pos = $v_con_pos[$i] ;\n"); task.append(" printf OUT \" %-*s | %4d | %4d | %4d | %4d\",\n"); task.append(" $max_con_pos_len, $con_pos, ${$counts_pos}{tot}{$con_pos},\n"); task.append(" ${$counts_pos}{err_head}{$con_pos}, ${$counts_pos}{err_dep}{$con_pos},\n"); task.append(" ${$counts_pos}{err_dep}{$con_pos}+${$counts_pos}{err_head}{$con_pos}-${$counts_pos}{tot}{$con_pos} ;\n"); task.append(" }\n"); task.append(" else\n"); task.append(" {\n"); task.append(" printf OUT \" %-*s | %4s | %4s | %4s | %4s\",\n"); task.append(" $max_con_pos_len, ' ', ' ', ' ', ' ', ' ' ;\n"); task.append(" }\n\n"); task.append(" printf OUT \" ||\" ;\n\n"); task.append(" if (defined $v_con[$i])\n"); task.append(" {\n"); task.append(" $con = $v_con[$i] ;\n"); task.append(" printf OUT \" %-*s | %4d | %4d | %4d | %4d\",\n"); task.append(" $max_con_len+length($con)-uni_len($con), $con, ${$counts}{tot}{$con},\n"); task.append(" ${$counts}{err_head}{$con}, ${$counts}{err_dep}{$con},\n"); task.append(" ${$counts}{err_dep}{$con}+${$counts}{err_head}{$con}-${$counts}{tot}{$con} ;\n"); task.append(" }\n"); task.append(" else\n"); task.append(" {\n"); task.append(" printf OUT \" %-*s | %4s | %4s | %4s | %4s\",\n"); task.append(" $max_con_len, ' ', ' ', ' ', ' ', ' ' ;\n"); task.append(" }\n\n"); task.append(" printf OUT \"\\n\" ;\n"); task.append(" }\n\n"); task.append(" printf OUT \" %s-+------+------+------+-----\", '-' x $max_con_pos_len;\n"); task.append(" printf OUT \"--++\" ;\n"); task.append(" printf OUT \"--%s-+------+------+------+-----\", '-' x $max_con_len;\n"); task.append(" printf OUT \"\\n\" ;\n\n"); task.append(" printf OUT \"\\n\\n\" ;\n\n"); task.append("} # print_context\n\n"); task.append("sub num_as_word\n"); task.append("{\n"); task.append(" my ($num) = @_ ;\n\n"); task.append(" $num = abs($num) ;\n\n"); task.append(" if ($num == 1)\n"); task.append(" {\n"); task.append(" return ('one word') ;\n"); task.append(" }\n"); task.append(" elsif ($num == 2)\n"); task.append(" {\n"); task.append(" return ('two words') ;\n"); task.append(" }\n"); task.append(" elsif ($num == 3)\n"); task.append(" {\n"); task.append(" return ('three words') ;\n"); task.append(" }\n"); task.append(" elsif ($num == 4)\n"); task.append(" {\n"); task.append(" return ('four words') ;\n"); task.append(" }\n"); task.append(" else\n"); task.append(" {\n"); task.append(" return ($num.' words') ;\n"); task.append(" }\n"); task.append("}\n\n"); task.append("sub describe_err\n"); task.append("{ # describe_err\n\n"); task.append(" my ($head_err, $head_aft_bef, $dep_err) = @_ ;\n"); task.append(" my ($dep_g, $dep_s, $desc) ;\n"); task.append(" my ($head_aft_bef_g, $head_aft_bef_s) = split(//, $head_aft_bef) ;\n\n"); task.append(" if ($head_err eq '-')\n"); task.append(" {\n"); task.append(" $desc = 'correct head' ;\n\n"); task.append(" if ($head_aft_bef_s eq '0')\n"); task.append(" {\n"); task.append(" $desc .= ' (0)' ;\n"); task.append(" }\n"); task.append(" elsif ($head_aft_bef_s eq 'e')\n"); task.append(" {\n"); task.append(" $desc .= ' (the focus word)' ;\n"); task.append(" }\n"); task.append(" elsif ($head_aft_bef_s eq 'a')\n"); task.append(" {\n"); task.append(" $desc .= ' (after the focus word)' ;\n"); task.append(" }\n"); task.append(" elsif ($head_aft_bef_s eq 'b')\n"); task.append(" {\n"); task.append(" $desc .= ' (before the focus word)' ;\n"); task.append(" }\n"); task.append(" }\n"); task.append(" elsif ($head_aft_bef_s eq '0')\n"); task.append(" {\n"); task.append(" $desc = 'head = 0 instead of ' ;\n"); task.append(" if ($head_aft_bef_g eq 'a')\n"); task.append(" {\n"); task.append(" $desc.= 'after ' ;\n"); task.append(" }\n"); task.append(" if ($head_aft_bef_g eq 'b')\n"); task.append(" {\n"); task.append(" $desc.= 'before ' ;\n"); task.append(" }\n"); task.append(" $desc .= 'the focus word' ;\n"); task.append(" }\n"); task.append(" elsif ($head_aft_bef_g eq '0')\n"); task.append(" {\n"); task.append(" $desc = 'head is ' ;\n"); task.append(" if ($head_aft_bef_g eq 'a')\n"); task.append(" {\n"); task.append(" $desc.= 'after ' ;\n"); task.append(" }\n"); task.append(" if ($head_aft_bef_g eq 'b')\n"); task.append(" {\n"); task.append(" $desc.= 'before ' ;\n"); task.append(" }\n"); task.append(" $desc .= 'the focus word instead of 0' ;\n"); task.append(" }\n"); task.append(" else\n"); task.append(" {\n"); task.append(" $desc = num_as_word($head_err) ;\n"); task.append(" if ($head_err < 0)\n"); task.append(" {\n"); task.append(" $desc .= ' before' ;\n"); task.append(" }\n"); task.append(" else\n"); task.append(" {\n"); task.append(" $desc .= ' after' ;\n"); task.append(" }\n\n"); task.append(" $desc = 'head '.$desc.' the correct head ' ;\n\n"); task.append(" if ($head_aft_bef_s eq '0')\n"); task.append(" {\n"); task.append(" $desc .= '(0' ;\n"); task.append(" }\n"); task.append(" elsif ($head_aft_bef_s eq 'e')\n"); task.append(" {\n"); task.append(" $desc .= '(the focus word' ;\n"); task.append(" }\n"); task.append(" elsif ($head_aft_bef_s eq 'a')\n"); task.append(" {\n"); task.append(" $desc .= '(after the focus word' ;\n"); task.append(" }\n"); task.append(" elsif ($head_aft_bef_s eq 'b')\n"); task.append(" {\n"); task.append(" $desc .= '(before the focus word' ;\n"); task.append(" }\n\n"); task.append(" if ($head_aft_bef_g ne $head_aft_bef_s)\n"); task.append(" {\n"); task.append(" $desc .= ' instead of' ;\n"); task.append(" if ($head_aft_bef_s eq '0')\n"); task.append(" {\n"); task.append(" $desc .= '0' ;\n"); task.append(" }\n"); task.append(" elsif ($head_aft_bef_s eq 'e')\n"); task.append(" {\n"); task.append(" $desc .= 'the focus word' ;\n"); task.append(" }\n"); task.append(" elsif ($head_aft_bef_s eq 'a')\n"); task.append(" {\n"); task.append(" $desc .= 'after the focus word' ;\n"); task.append(" }\n"); task.append(" elsif ($head_aft_bef_s eq 'b')\n"); task.append(" {\n"); task.append(" $desc .= 'before the focus word' ;\n"); task.append(" }\n"); task.append(" }\n\n"); task.append(" $desc .= ')' ;\n"); task.append(" }\n\n"); task.append(" $desc .= ', ' ;\n\n"); task.append(" if ($dep_err eq '-')\n"); task.append(" {\n"); task.append(" $desc .= 'correct dependency' ;\n"); task.append(" }\n"); task.append(" else\n"); task.append(" {\n"); task.append(" ($dep_g, $dep_s) = ($dep_err =~ /^(.*)->(.*)$/) ;\n"); task.append(" $desc .= sprintf('dependency \"%s\" instead of \"%s\"', $dep_s, $dep_g) ;\n"); task.append(" }\n\n"); task.append(" return($desc) ;\n\n"); task.append("} # describe_err\n\n"); task.append("sub get_context\n"); task.append("{ # get_context\n\n"); task.append(" my ($sent, $i_w) = @_ ;\n"); task.append(" my ($w_2, $w_1, $w1, $w2) ;\n"); task.append(" my ($p_2, $p_1, $p1, $p2) ;\n\n"); task.append(" if ($i_w >= 2)\n"); task.append(" {\n"); task.append(" $w_2 = ${${$sent}[$i_w-2]}{word} ;\n"); task.append(" $p_2 = ${${$sent}[$i_w-2]}{pos} ;\n"); task.append(" }\n"); task.append(" else\n"); task.append(" {\n"); task.append(" $w_2 = $START ;\n"); task.append(" $p_2 = $START ;\n"); task.append(" }\n\n"); task.append(" if ($i_w >= 1)\n"); task.append(" {\n"); task.append(" $w_1 = ${${$sent}[$i_w-1]}{word} ;\n"); task.append(" $p_1 = ${${$sent}[$i_w-1]}{pos} ;\n"); task.append(" }\n"); task.append(" else\n"); task.append(" {\n"); task.append(" $w_1 = $START ;\n"); task.append(" $p_1 = $START ;\n"); task.append(" }\n\n"); task.append(" if ($i_w <= scalar @{$sent}-2)\n"); task.append(" {\n"); task.append(" $w1 = ${${$sent}[$i_w+1]}{word} ;\n"); task.append(" $p1 = ${${$sent}[$i_w+1]}{pos} ;\n"); task.append(" }\n"); task.append(" else\n"); task.append(" {\n"); task.append(" $w1 = $END ;\n"); task.append(" $p1 = $END ;\n"); task.append(" }\n\n"); task.append(" if ($i_w <= scalar @{$sent}-3)\n"); task.append(" {\n"); task.append(" $w2 = ${${$sent}[$i_w+2]}{word} ;\n"); task.append(" $p2 = ${${$sent}[$i_w+2]}{pos} ;\n"); task.append(" }\n"); task.append(" else\n"); task.append(" {\n"); task.append(" $w2 = $END ;\n"); task.append(" $p2 = $END ;\n"); task.append(" }\n\n"); task.append(" return ($w_2, $w_1, $w1, $w2, $p_2, $p_1, $p1, $p2) ;\n\n"); task.append("} # get_context\n\n"); task.append("sub read_sent\n"); task.append("{ # read_sent\n\n"); task.append(" my ($sent_gold, $sent_sys) = @_ ;\n"); task.append(" my ($line_g, $line_s, $new_sent) ;\n"); task.append(" my (%fields_g, %fields_s) ;\n\n"); task.append(" $new_sent = 1 ;\n\n"); task.append(" @{$sent_gold} = () ;\n"); task.append(" @{$sent_sys} = () ;\n\n"); task.append(" while (1)\n"); task.append(" { # main reading loop\n\n"); task.append(" $line_g = <GOLD> ;\n"); task.append(" $line_s = <SYS> ;\n\n"); task.append(" $line_num++ ;\n\n"); task.append(" # system output has fewer lines than gold standard\n"); task.append(" if ((defined $line_g) && (! defined $line_s))\n"); task.append(" {\n"); task.append(" if ($line_g =~ /^\\s*$/) {\n"); task.append(" warn \"Warning: ignoring missing blank line at the end of $opt_s.\\n\";\n"); task.append(" next;\n"); task.append(" }\n"); task.append(" printf STDERR \"Fatal: line mismatch, line %d:\\n\", $line_num ;\n"); task.append(" printf STDERR \" gold: %s\", $line_g ;\n"); task.append(" printf STDERR \" sys : past end of file\\n\" ;\n"); task.append(" exit(1) ;\n"); task.append(" }\n\n"); task.append(" # system output has more lines than gold standard\n"); task.append(" if ((! defined $line_g) && (defined $line_s))\n"); task.append(" {\n"); task.append(" if ($line_s =~ /^\\s*$/) {\n"); task.append(" warn \"Warning: ignoring extra blank line at the end of $opt_s.\\n\";\n"); task.append(" next;\n"); task.append(" }\n"); task.append(" printf STDERR \"Fatal: line mismatch, line %d:\\n\", $line_num ;\n"); task.append(" printf STDERR \" gold: past end of file\\n\" ;\n"); task.append(" printf STDERR \" sys : %s\", $line_s ;\n"); task.append(" exit(1) ;\n"); task.append(" }\n\n"); task.append(" # end of file reached for both\n"); task.append(" if ((! defined $line_g) && (! defined $line_s))\n"); task.append(" {\n"); task.append(" return (1) ;\n"); task.append(" }\n\n"); task.append(" # one contains end of sentence but other one does not\n"); task.append(" if (($line_g =~ /^\\s+$/) != ($line_s =~ /^\\s+$/))\n"); task.append(" {\n"); task.append(" printf STDERR \"Fatal: line mismatch, line %d:\\n\", $line_num ;\n"); task.append(" printf STDERR \" gold: %s\", $line_g ;\n"); task.append(" printf STDERR \" sys : %s\", $line_s ;\n"); task.append(" exit(1) ;\n"); task.append(" }\n\n"); task.append(" # end of sentence reached\n"); task.append(" if ($line_g =~ /^\\s+$/)\n"); task.append(" {\n"); task.append(" return(0) ;\n"); task.append(" }\n\n"); task.append(" # now both lines contain information\n\n"); task.append(" if ($new_sent)\n"); task.append(" {\n"); task.append(" $new_sent = 0 ;\n"); task.append(" }\n\n"); task.append(" # 'official' column names\n"); task.append(" # options.output = ['id','form','lemma','cpostag','postag',\n"); task.append(" # 'feats','head','deprel','phead','pdeprel']\n\n"); task.append(" @fields_g{'word', 'pos', 'head', 'dep'} = (split (/\\s+/, $line_g))[1, 3, 6, 7] ;\n\n"); task.append(" push @{$sent_gold}, { %fields_g } ;\n\n"); task.append(" @fields_s{'word', 'pos', 'head', 'dep'} = (split (/\\s+/, $line_s))[1, 3, 6, 7] ;\n\n"); task.append("# Some teams like to change the word or the pos in the answer file...\n"); task.append("# So do not make this fatal and only give one warning.\n\n"); task.append(" if ((not defined $word_mismatch_warning) &&\n"); task.append(" (($fields_g{word} ne $fields_s{word}) ||\n"); task.append(" ($fields_g{pos} ne $fields_s{pos})))\n"); task.append(" {\n"); task.append(" $word_mismatch_warning = 1;\n"); task.append(" printf STDERR \"Warning: ignoring word/pos mismatch, line %d:\\n\", $line_num ;\n"); task.append(" printf STDERR \" gold: %s\", $line_g ;\n"); task.append(" printf STDERR \" sys : %s\", $line_s ;\n"); task.append(" # exit(1) ;\n"); task.append(" }\n\n"); task.append(" push @{$sent_sys}, { %fields_s } ;\n\n"); task.append(" } # main reading loop\n\n"); task.append("} # read_sent\n\n"); task.append("################################################################################\n"); task.append("### main ###\n"); task.append("################################################################################\n\n"); task.append("my ($sent_num, $eof, $word_num, @err_sent) ;\n"); task.append("my (@sent_gold, @sent_sys, @starts) ;\n"); task.append("my ($word, $pos, $wp, $head_g, $dep_g, $head_s, $dep_s) ;\n"); task.append("my (%counts, $err_head, $err_dep, $con, $con1, $con_pos, $con_pos1, $thresh) ;\n"); task.append("my ($head_err, $dep_err, @cur_err, %err_counts, $err_counter, $err_desc) ;\n"); task.append("my ($loc_con, %loc_con_err_counts, %err_desc) ;\n"); task.append("my ($head_aft_bef_g, $head_aft_bef_s, $head_aft_bef) ;\n"); task.append("my ($con_bef, $con_aft, $con_bef_2, $con_aft_2, @bits, @e_bits, @v_con, @v_con_pos) ;\n"); task.append("my ($con_pos_bef, $con_pos_aft, $con_pos_bef_2, $con_pos_aft_2) ;\n"); task.append("my ($max_word_len, $max_pos_len, $max_con_len, $max_con_pos_len) ;\n"); task.append("my ($max_word_spec_len, $max_con_bef_len, $max_con_aft_len) ;\n"); task.append("my (%freq_err, $err) ;\n\n"); task.append("my ($i, $j, $i_w, $l, $n_args) ;\n"); task.append("my ($w_2, $w_1, $w1, $w2) ;\n"); task.append("my ($wp_2, $wp_1, $wp1, $wp2) ;\n"); task.append("my ($p_2, $p_1, $p1, $p2) ;\n\n"); task.append("my ($short_output) ;\n"); task.append("my ($score_on_punct, $score_on_deriv) ;\n"); task.append("$counts{punct} = 0; # initialize\n"); task.append("$counts{deriv} = 0;\n\n"); task.append("getopts(\"g:o:s:qvhpbd\") ;\n\n"); task.append("if (defined $opt_v)\n"); task.append("{\n"); task.append(" my $id = '$Id: eval07.pl,v 1.4 2007/04/02 21:17:34 dyuret Exp $';\n"); task.append(" my @parts = split ' ',$id;\n"); task.append(" print \"Version $parts[2]\\n\";\n"); task.append(" exit(0);\n"); task.append("}\n\n"); task.append("if ((defined $opt_h) || ((! defined $opt_g) && (! defined $opt_s)))\n"); task.append("{\n"); task.append(" die $usage ;\n"); task.append("}\n\n"); task.append("if (! defined $opt_g)\n"); task.append("{\n"); task.append(" die \"Gold standard file (-g) missing\\n\" ;\n"); task.append("}\n\n"); task.append("if (! defined $opt_s)\n"); task.append("{\n"); task.append(" die \"System output file (-s) missing\\n\" ;\n"); task.append("}\n\n"); task.append("if (! defined $opt_o)\n"); task.append("{\n"); task.append(" $opt_o = '-' ;\n"); task.append("}\n\n"); task.append("if (defined $opt_q)\n"); task.append("{\n"); task.append(" $short_output = 1 ;\n"); task.append("} else {\n"); task.append(" $short_output = 0 ;\n"); task.append("}\n\n"); task.append("if (defined $opt_p)\n"); task.append("{\n"); task.append(" $score_on_punct = 0 ;\n"); task.append("} else {\n"); task.append(" $score_on_punct = 1 ;\n"); task.append("}\n\n"); task.append("if (defined $opt_d)\n"); task.append("{\n"); task.append(" $score_on_deriv = 0 ;\n"); task.append("} else {\n"); task.append(" $score_on_deriv = 1 ;\n"); task.append("}\n\n"); task.append("$line_num = 0 ;\n"); task.append("$sent_num = 0 ;\n"); task.append("$eof = 0 ;\n\n"); task.append("@err_sent = () ;\n"); task.append("@starts = () ;\n\n"); task.append("%{$err_sent[0]} = () ;\n\n"); task.append("$max_pos_len = length('CPOS') ;\n\n"); task.append("################################################################################\n"); task.append("### reading input ###\n"); task.append("################################################################################\n\n"); task.append("open (GOLD, \"<$opt_g\") || die \"Could not open gold standard file $opt_g\\n\" ;\n"); task.append("open (SYS, \"<$opt_s\") || die \"Could not open system output file $opt_s\\n\" ;\n"); task.append("open (OUT, \">$opt_o\") || die \"Could not open output file $opt_o\\n\" ;\n\n\n"); task.append("if (defined $opt_b) { # produce output similar to evalb\n"); task.append(" print OUT \" Sent. Attachment Correct Scoring \\n\";\n"); task.append(" print OUT \" ID Tokens - Unlab. Lab. HEAD HEAD+DEPREL tokens - - - -\\n\";\n"); task.append(" print OUT \" ============================================================================\\n\";\n"); task.append("}\n\n\n"); task.append("while (! $eof)\n"); task.append("{ # main reading loop\n\n"); task.append(" $starts[$sent_num] = $line_num+1 ;\n"); task.append(" $eof = read_sent(\\@sent_gold, \\@sent_sys) ;\n\n"); task.append(" $sent_num++ ;\n\n"); task.append(" %{$err_sent[$sent_num]} = () ;\n"); task.append(" $word_num = scalar @sent_gold ;\n\n"); task.append(" # for accuracy per sentence\n"); task.append(" my %sent_counts = ( tot => 0,\n"); task.append(" err_any => 0,\n"); task.append(" err_head => 0\n"); task.append(" ); \n\n"); task.append(" # printf \"$sent_num $word_num\\n\" ;\n\n"); task.append(" my @frames_g = ('** '); # the initial frame for the virtual root\n"); task.append(" my @frames_s = ('** '); # the initial frame for the virtual root\n"); task.append(" foreach $i_w (0 .. $word_num-1)\n"); task.append(" { # loop on words\n"); task.append(" push @frames_g, ''; # initialize\n"); task.append(" push @frames_s, ''; # initialize\n"); task.append(" }\n\n"); task.append(" foreach $i_w (0 .. $word_num-1)\n"); task.append(" { # loop on words\n\n"); task.append(" ($word, $pos, $head_g, $dep_g)\n"); task.append(" = @{$sent_gold[$i_w]}{'word', 'pos', 'head', 'dep'} ;\n"); task.append(" $wp = $word.' / '.$pos ;\n\n"); task.append(" # printf \"%d: %s %s %s %s\\n\", $i_w, $word, $pos, $head_g, $dep_g ;\n\n"); task.append(" if ((! $score_on_punct) && is_uni_punct($word))\n"); task.append(" {\n"); task.append(" $counts{punct}++ ;\n"); task.append(" # ignore punctuations\n"); task.append(" next ;\n"); task.append(" }\n\n"); task.append(" if ((! $score_on_deriv) && ($dep_g eq 'DERIV'))\n"); task.append(" {\n"); task.append(" $counts{deriv}++ ;\n"); task.append(" # ignore deriv links\n"); task.append(" next ;\n"); task.append(" }\n\n"); task.append(" if (length($pos) > $max_pos_len)\n"); task.append(" {\n"); task.append(" $max_pos_len = length($pos) ;\n"); task.append(" }\n\n"); task.append(" ($head_s, $dep_s) = @{$sent_sys[$i_w]}{'head', 'dep'} ;\n\n"); task.append(" $counts{tot}++ ;\n"); task.append(" $counts{word}{$wp}{tot}++ ;\n"); task.append(" $counts{pos}{$pos}{tot}++ ;\n"); task.append(" $counts{head}{$head_g-$i_w-1}{tot}++ ;\n\n"); task.append(" # for frame confusions\n"); task.append(" # add child to frame of parent\n"); task.append(" $frames_g[$head_g] .= \"$dep_g \";\n"); task.append(" $frames_s[$head_s] .= \"$dep_s \";\n"); task.append(" # add to frame of token itself\n"); task.append(" $frames_g[$i_w+1] .= \"*$dep_g* \"; # $i_w+1 because $i_w starts counting at zero\n"); task.append(" $frames_s[$i_w+1] .= \"*$dep_g* \";\n\n"); task.append(" # for precision and recall of DEPREL\n"); task.append(" $counts{dep}{$dep_g}{tot}++ ; # counts for gold standard deprels\n"); task.append(" $counts{dep2}{$dep_g}{$dep_s}++ ; # counts for confusions\n"); task.append(" $counts{dep_s}{$dep_s}{tot}++ ; # counts for system deprels\n"); task.append(" $counts{all_dep}{$dep_g} = 1 ; # list of all deprels that occur ...\n"); task.append(" $counts{all_dep}{$dep_s} = 1 ; # ... in either gold or system output\n\n"); task.append(" # for precision and recall of HEAD direction\n"); task.append(" my $dir_g;\n"); task.append(" if ($head_g == 0) {\n"); task.append(" $dir_g = 'to_root';\n"); task.append(" } elsif ($head_g < $i_w+1) { # $i_w+1 because $i_w starts counting at zero\n"); task.append(" # also below\n"); task.append(" $dir_g = 'left';\n"); task.append(" } elsif ($head_g > $i_w+1) {\n"); task.append(" $dir_g = 'right';\n"); task.append(" } else {\n"); task.append(" # token links to itself; should never happen in correct gold standard\n"); task.append(" $dir_g = 'self'; \n"); task.append(" }\n"); task.append(" my $dir_s;\n"); task.append(" if ($head_s == 0) {\n"); task.append(" $dir_s = 'to_root';\n"); task.append(" } elsif ($head_s < $i_w+1) {\n"); task.append(" $dir_s = 'left';\n"); task.append(" } elsif ($head_s > $i_w+1) {\n"); task.append(" $dir_s = 'right';\n"); task.append(" } else {\n"); task.append(" # token links to itself; should not happen in good system \n"); task.append(" # (but not forbidden in shared task)\n"); task.append(" $dir_s = 'self'; \n"); task.append(" }\n"); task.append(" $counts{dir_g}{$dir_g}{tot}++ ; # counts for gold standard head direction\n"); task.append(" $counts{dir2}{$dir_g}{$dir_s}++ ; # counts for confusions\n"); task.append(" $counts{dir_s}{$dir_s}{tot}++ ; # counts for system head direction\n\n"); task.append(" # for precision and recall of HEAD distance\n"); task.append(" my $dist_g;\n"); task.append(" if ($head_g == 0) {\n"); task.append(" $dist_g = 'to_root';\n"); task.append(" } elsif ( abs($head_g - ($i_w+1)) <= 1 ) {\n"); task.append(" $dist_g = '1'; # includes the 'self' cases\n"); task.append(" } elsif ( abs($head_g - ($i_w+1)) <= 2 ) {\n"); task.append(" $dist_g = '2';\n"); task.append(" } elsif ( abs($head_g - ($i_w+1)) <= 6 ) {\n"); task.append(" $dist_g = '3-6';\n"); task.append(" } else {\n"); task.append(" $dist_g = '7-...';\n"); task.append(" }\n"); task.append(" my $dist_s;\n"); task.append(" if ($head_s == 0) {\n"); task.append(" $dist_s = 'to_root';\n"); task.append(" } elsif ( abs($head_s - ($i_w+1)) <= 1 ) {\n"); task.append(" $dist_s = '1'; # includes the 'self' cases\n"); task.append(" } elsif ( abs($head_s - ($i_w+1)) <= 2 ) {\n"); task.append(" $dist_s = '2';\n"); task.append(" } elsif ( abs($head_s - ($i_w+1)) <= 6 ) {\n"); task.append(" $dist_s = '3-6';\n"); task.append(" } else {\n"); task.append(" $dist_s = '7-...';\n"); task.append(" }\n"); task.append(" $counts{dist_g}{$dist_g}{tot}++ ; # counts for gold standard head distance\n"); task.append(" $counts{dist2}{$dist_g}{$dist_s}++ ; # counts for confusions\n"); task.append(" $counts{dist_s}{$dist_s}{tot}++ ; # counts for system head distance\n\n\n"); task.append(" $err_head = ($head_g ne $head_s) ; # error in head\n"); task.append(" $err_dep = ($dep_g ne $dep_s) ; # error in deprel\n\n"); task.append(" $head_err = '-' ;\n"); task.append(" $dep_err = '-' ;\n\n"); task.append(" # for accuracy per sentence\n"); task.append(" $sent_counts{tot}++ ;\n"); task.append(" if ($err_dep || $err_head) {\n"); task.append(" $sent_counts{err_any}++ ;\n"); task.append(" }\n"); task.append(" if ($err_head) {\n"); task.append(" $sent_counts{err_head}++ ;\n"); task.append(" }\n\n"); task.append(" # total counts and counts for CPOS involved in errors\n\n"); task.append(" if ($head_g eq '0')\n"); task.append(" {\n"); task.append(" $head_aft_bef_g = '0' ;\n"); task.append(" }\n"); task.append(" elsif ($head_g eq $i_w+1)\n"); task.append(" {\n"); task.append(" $head_aft_bef_g = 'e' ;\n"); task.append(" }\n"); task.append(" else\n"); task.append(" {\n"); task.append(" $head_aft_bef_g = ($head_g <= $i_w+1 ? 'b' : 'a') ;\n"); task.append(" }\n\n"); task.append(" if ($head_s eq '0')\n"); task.append(" {\n"); task.append(" $head_aft_bef_s = '0' ;\n"); task.append(" }\n"); task.append(" elsif ($head_s eq $i_w+1)\n"); task.append(" {\n"); task.append(" $head_aft_bef_s = 'e' ;\n"); task.append(" }\n"); task.append(" else\n"); task.append(" {\n"); task.append(" $head_aft_bef_s = ($head_s <= $i_w+1 ? 'b' : 'a') ;\n"); task.append(" }\n\n"); task.append(" $head_aft_bef = $head_aft_bef_g.$head_aft_bef_s ;\n\n"); task.append(" if ($err_head)\n"); task.append(" {\n"); task.append(" if ($head_aft_bef_s eq '0')\n"); task.append(" {\n"); task.append(" $head_err = 0 ;\n"); task.append(" }\n"); task.append(" else\n"); task.append(" {\n"); task.append(" $head_err = $head_s-$head_g ;\n"); task.append(" }\n\n"); task.append(" $err_sent[$sent_num]{head}++ ;\n"); task.append(" $counts{err_head}{tot}++ ;\n"); task.append(" $counts{err_head}{$head_err}++ ;\n\n"); task.append(" $counts{word}{err_head}{$wp}++ ;\n"); task.append(" $counts{pos}{$pos}{err_head}{tot}++ ;\n"); task.append(" $counts{pos}{$pos}{err_head}{$head_err}++ ;\n"); task.append(" }\n\n"); task.append(" if ($err_dep)\n"); task.append(" {\n"); task.append(" $dep_err = $dep_g.'->'.$dep_s ;\n"); task.append(" $err_sent[$sent_num]{dep}++ ;\n"); task.append(" $counts{err_dep}{tot}++ ;\n"); task.append(" $counts{err_dep}{$dep_err}++ ;\n\n"); task.append(" $counts{word}{err_dep}{$wp}++ ;\n"); task.append(" $counts{pos}{$pos}{err_dep}{tot}++ ;\n"); task.append(" $counts{pos}{$pos}{err_dep}{$dep_err}++ ;\n\n"); task.append(" if ($err_head)\n"); task.append(" {\n"); task.append(" $counts{err_both}++ ;\n"); task.append(" $counts{pos}{$pos}{err_both}++ ;\n"); task.append(" }\n"); task.append(" }\n\n"); task.append(" ### DEPREL + ATTACHMENT\n"); task.append(" if ((!$err_dep) && ($err_head)) {\n"); task.append(" $counts{err_head_corr_dep}{tot}++ ;\n"); task.append(" $counts{err_head_corr_dep}{$dep_s}++ ;\n"); task.append(" }\n"); task.append(" ### DEPREL + ATTACHMENT\n\n"); task.append(" # counts for words involved in errors\n\n"); task.append(" if (! ($err_head || $err_dep))\n"); task.append(" {\n"); task.append(" next ;\n"); task.append(" }\n\n"); task.append(" $err_sent[$sent_num]{word}++ ;\n"); task.append(" $counts{err_any}++ ;\n"); task.append(" $counts{word}{err_any}{$wp}++ ;\n"); task.append(" $counts{pos}{$pos}{err_any}++ ;\n\n"); task.append(" ($w_2, $w_1, $w1, $w2, $p_2, $p_1, $p1, $p2) = get_context(\\@sent_gold, $i_w) ;\n\n"); task.append(" if ($w_2 ne $START)\n"); task.append(" {\n"); task.append(" $wp_2 = $w_2.' / '.$p_2 ;\n"); task.append(" }\n"); task.append(" else\n"); task.append(" {\n"); task.append(" $wp_2 = $w_2 ;\n"); task.append(" }\n\n"); task.append(" if ($w_1 ne $START)\n"); task.append(" {\n"); task.append(" $wp_1 = $w_1.' / '.$p_1 ;\n"); task.append(" }\n"); task.append(" else\n"); task.append(" {\n"); task.append(" $wp_1 = $w_1 ;\n"); task.append(" }\n\n"); task.append(" if ($w1 ne $END)\n"); task.append(" {\n"); task.append(" $wp1 = $w1.' / '.$p1 ;\n"); task.append(" }\n"); task.append(" else\n"); task.append(" {\n"); task.append(" $wp1 = $w1 ;\n"); task.append(" }\n\n"); task.append(" if ($w2 ne $END)\n"); task.append(" {\n"); task.append(" $wp2 = $w2.' / '.$p2 ;\n"); task.append(" }\n"); task.append(" else\n"); task.append(" {\n"); task.append(" $wp2 = $w2 ;\n"); task.append(" }\n\n"); task.append(" $con_bef = $wp_1 ;\n"); task.append(" $con_bef_2 = $wp_2.' + '.$wp_1 ;\n"); task.append(" $con_aft = $wp1 ;\n"); task.append(" $con_aft_2 = $wp1.' + '.$wp2 ;\n\n"); task.append(" $con_pos_bef = $p_1 ;\n"); task.append(" $con_pos_bef_2 = $p_2.'+'.$p_1 ;\n"); task.append(" $con_pos_aft = $p1 ;\n"); task.append(" $con_pos_aft_2 = $p1.'+'.$p2 ;\n\n"); task.append(" if ($w_1 ne $START)\n"); task.append(" {\n"); task.append(" # do not count '.S' as a word context\n"); task.append(" $counts{con_bef_2}{tot}{$con_bef_2}++ ;\n"); task.append(" $counts{con_bef_2}{err_head}{$con_bef_2} += $err_head ;\n"); task.append(" $counts{con_bef_2}{err_dep}{$con_bef_2} += $err_dep ;\n"); task.append(" $counts{con_bef}{tot}{$con_bef}++ ;\n"); task.append(" $counts{con_bef}{err_head}{$con_bef} += $err_head ;\n"); task.append(" $counts{con_bef}{err_dep}{$con_bef} += $err_dep ;\n"); task.append(" }\n\n"); task.append(" if ($w1 ne $END)\n"); task.append(" {\n"); task.append(" # do not count '.E' as a word context\n"); task.append(" $counts{con_aft_2}{tot}{$con_aft_2}++ ;\n"); task.append(" $counts{con_aft_2}{err_head}{$con_aft_2} += $err_head ;\n"); task.append(" $counts{con_aft_2}{err_dep}{$con_aft_2} += $err_dep ;\n"); task.append(" $counts{con_aft}{tot}{$con_aft}++ ;\n"); task.append(" $counts{con_aft}{err_head}{$con_aft} += $err_head ;\n"); task.append(" $counts{con_aft}{err_dep}{$con_aft} += $err_dep ;\n"); task.append(" }\n\n"); task.append(" $counts{con_pos_bef_2}{tot}{$con_pos_bef_2}++ ;\n"); task.append(" $counts{con_pos_bef_2}{err_head}{$con_pos_bef_2} += $err_head ;\n"); task.append(" $counts{con_pos_bef_2}{err_dep}{$con_pos_bef_2} += $err_dep ;\n"); task.append(" $counts{con_pos_bef}{tot}{$con_pos_bef}++ ;\n"); task.append(" $counts{con_pos_bef}{err_head}{$con_pos_bef} += $err_head ;\n"); task.append(" $counts{con_pos_bef}{err_dep}{$con_pos_bef} += $err_dep ;\n\n"); task.append(" $counts{con_pos_aft_2}{tot}{$con_pos_aft_2}++ ;\n"); task.append(" $counts{con_pos_aft_2}{err_head}{$con_pos_aft_2} += $err_head ;\n"); task.append(" $counts{con_pos_aft_2}{err_dep}{$con_pos_aft_2} += $err_dep ;\n"); task.append(" $counts{con_pos_aft}{tot}{$con_pos_aft}++ ;\n"); task.append(" $counts{con_pos_aft}{err_head}{$con_pos_aft} += $err_head ;\n"); task.append(" $counts{con_pos_aft}{err_dep}{$con_pos_aft} += $err_dep ;\n\n"); task.append(" $err = $head_err.$sep.$head_aft_bef.$sep.$dep_err ;\n"); task.append(" $freq_err{$err}++ ;\n\n"); task.append(" } # loop on words\n\n"); task.append(" foreach $i_w (0 .. $word_num) # including one for the virtual root\n"); task.append(" { # loop on words\n"); task.append(" if ($frames_g[$i_w] ne $frames_s[$i_w]) {\n"); task.append(" $counts{frame2}{\"$frames_g[$i_w]/ $frames_s[$i_w]\"}++ ;\n"); task.append(" }\n"); task.append(" }\n\n"); task.append(" if (defined $opt_b) { # produce output similar to evalb\n"); task.append(" if ($word_num > 0) {\n"); task.append(" my ($unlabeled,$labeled) = ('NaN', 'NaN');\n"); task.append(" if ($sent_counts{tot} > 0) { # there are scoring tokens\n"); task.append(" $unlabeled = 100-$sent_counts{err_head}*100.0/$sent_counts{tot};\n"); task.append(" $labeled = 100-$sent_counts{err_any} *100.0/$sent_counts{tot};\n"); task.append(" }\n"); task.append(" printf OUT \" %4d %4d 0 %6.2f %6.2f %4d %4d %4d 0 0 0 0\\n\", \n"); task.append(" $sent_num, $word_num, \n"); task.append(" $unlabeled, $labeled, \n"); task.append(" $sent_counts{tot}-$sent_counts{err_head}, \n"); task.append(" $sent_counts{tot}-$sent_counts{err_any}, \n"); task.append(" $sent_counts{tot},;\n"); task.append(" }\n"); task.append(" }\n\n"); task.append("} # main reading loop\n\n"); task.append("################################################################################\n"); task.append("### printing output ###\n"); task.append("################################################################################\n\n"); task.append("if (defined $opt_b) { # produce output similar to evalb\n"); task.append(" print OUT \"\\n\\n\";\n"); task.append("}\n"); task.append("printf OUT \" Labeled attachment score: %d / %d * 100 = %.2f %%\\n\", \n"); task.append(" $counts{tot}-$counts{err_any}, $counts{tot}, 100-$counts{err_any}*100.0/$counts{tot} ;\n"); task.append("printf OUT \" Unlabeled attachment score: %d / %d * 100 = %.2f %%\\n\", \n"); task.append(" $counts{tot}-$counts{err_head}{tot}, $counts{tot}, 100-$counts{err_head}{tot}*100.0/$counts{tot} ;\n"); task.append("printf OUT \" Label accuracy score: %d / %d * 100 = %.2f %%\\n\", \n"); task.append(" $counts{tot}-$counts{err_dep}{tot}, $counts{tot}, 100-$counts{err_dep}{tot}*100.0/$counts{tot} ;\n\n"); task.append("if ($short_output)\n"); task.append("{\n"); task.append(" exit(0) ;\n"); task.append("}\n"); task.append("printf OUT \"\\n %s\\n\\n\", '=' x 80 ;\n"); task.append("printf OUT \" Evaluation of the results in %s\\n vs. gold standard %s:\\n\\n\", $opt_s, $opt_g ;\n\n"); task.append("printf OUT \" Legend: '%s' - the beginning of a sentence, '%s' - the end of a sentence\\n\\n\", $START, $END ;\n\n"); task.append("printf OUT \" Number of non-scoring tokens: \" . ($counts{deriv} + $counts{punct}) . \"\\n\\n\";\n\n"); task.append("printf OUT \" The overall accuracy and its distribution over CPOSTAGs\\n\\n\" ;\n"); task.append("printf OUT \"%s\\n\", \" -----------+-------+-------+------+-------+------+-------+-------\" ;\n\n"); task.append("printf OUT \" %-10s | %-5s | %-5s | %% | %-5s | %% | %-5s | %%\\n\",\n"); task.append(" 'Accuracy', 'words', 'right', 'right', 'both' ;\n"); task.append("printf OUT \" %-10s | %-5s | %-5s | | %-5s | | %-5s |\\n\",\n"); task.append(" ' ', ' ', 'head', ' dep', 'right' ;\n\n"); task.append("printf OUT \"%s\\n\", \" -----------+-------+-------+------+-------+------+-------+-------\" ;\n\n"); task.append("printf OUT \" %-10s | %5d | %5d | %3.0f%% | %5d | %3.0f%% | %5d | %3.0f%%\\n\",\n"); task.append(" 'total', $counts{tot},\n"); task.append(" $counts{tot}-$counts{err_head}{tot}, 100-$counts{err_head}{tot}*100.0/$counts{tot},\n"); task.append(" $counts{tot}-$counts{err_dep}{tot}, 100-$counts{err_dep}{tot}*100.0/$counts{tot},\n"); task.append(" $counts{tot}-$counts{err_any}, 100-$counts{err_any}*100.0/$counts{tot} ;\n\n"); task.append("printf OUT \"%s\\n\", \" -----------+-------+-------+------+-------+------+-------+-------\" ;\n\n"); task.append("foreach $pos (sort {$counts{pos}{$b}{tot} <=> $counts{pos}{$a}{tot}} keys %{$counts{pos}})\n"); task.append("{\n"); task.append(" if (! defined($counts{pos}{$pos}{err_head}{tot}))\n"); task.append(" {\n"); task.append(" $counts{pos}{$pos}{err_head}{tot} = 0 ;\n"); task.append(" }\n"); task.append(" if (! defined($counts{pos}{$pos}{err_dep}{tot}))\n"); task.append(" {\n"); task.append(" $counts{pos}{$pos}{err_dep}{tot} = 0 ;\n"); task.append(" }\n"); task.append(" if (! defined($counts{pos}{$pos}{err_any}))\n"); task.append(" {\n"); task.append(" $counts{pos}{$pos}{err_any} = 0 ;\n"); task.append(" }\n\n"); task.append(" printf OUT \" %-10s | %5d | %5d | %3.0f%% | %5d | %3.0f%% | %5d | %3.0f%%\\n\",\n"); task.append(" $pos, $counts{pos}{$pos}{tot},\n"); task.append(" $counts{pos}{$pos}{tot}-$counts{pos}{$pos}{err_head}{tot}, 100-$counts{pos}{$pos}{err_head}{tot}*100.0/$counts{pos}{$pos}{tot},\n"); task.append(" $counts{pos}{$pos}{tot}-$counts{pos}{$pos}{err_dep}{tot}, 100-$counts{pos}{$pos}{err_dep}{tot}*100.0/$counts{pos}{$pos}{tot},\n"); task.append(" $counts{pos}{$pos}{tot}-$counts{pos}{$pos}{err_any}, 100-$counts{pos}{$pos}{err_any}*100.0/$counts{pos}{$pos}{tot} ;\n"); task.append("}\n\n"); task.append("printf OUT \"%s\\n\", \" -----------+-------+-------+------+-------+------+-------+-------\" ;\n\n"); task.append("printf OUT \"\\n\\n\" ;\n\n"); task.append("printf OUT \" The overall error rate and its distribution over CPOSTAGs\\n\\n\" ;\n"); task.append("printf OUT \"%s\\n\", \" -----------+-------+-------+------+-------+------+-------+-------\" ;\n\n"); task.append("printf OUT \" %-10s | %-5s | %-5s | %% | %-5s | %% | %-5s | %%\\n\",\n"); task.append(" 'Error', 'words', 'head', ' dep', 'both' ;\n"); task.append("printf OUT \" %-10s | %-5s | %-5s | | %-5s | | %-5s |\\n\",\n\n"); task.append(" 'Rate', ' ', 'err', ' err', 'wrong' ;\n\n"); task.append("printf OUT \"%s\\n\", \" -----------+-------+-------+------+-------+------+-------+-------\" ;\n\n"); task.append("printf OUT \" %-10s | %5d | %5d | %3.0f%% | %5d | %3.0f%% | %5d | %3.0f%%\\n\",\n"); task.append(" 'total', $counts{tot},\n"); task.append(" $counts{err_head}{tot}, $counts{err_head}{tot}*100.0/$counts{tot},\n"); task.append(" $counts{err_dep}{tot}, $counts{err_dep}{tot}*100.0/$counts{tot},\n"); task.append(" $counts{err_both}, $counts{err_both}*100.0/$counts{tot} ;\n\n"); task.append("printf OUT \"%s\\n\", \" -----------+-------+-------+------+-------+------+-------+-------\" ;\n\n"); task.append("foreach $pos (sort {$counts{pos}{$b}{tot} <=> $counts{pos}{$a}{tot}} keys %{$counts{pos}})\n"); task.append("{\n"); task.append(" if (! defined($counts{pos}{$pos}{err_both}))\n"); task.append(" {\n"); task.append(" $counts{pos}{$pos}{err_both} = 0 ;\n"); task.append(" }\n\n"); task.append(" printf OUT \" %-10s | %5d | %5d | %3.0f%% | %5d | %3.0f%% | %5d | %3.0f%%\\n\",\n"); task.append(" $pos, $counts{pos}{$pos}{tot},\n"); task.append(" $counts{pos}{$pos}{err_head}{tot}, $counts{pos}{$pos}{err_head}{tot}*100.0/$counts{pos}{$pos}{tot},\n"); task.append(" $counts{pos}{$pos}{err_dep}{tot}, $counts{pos}{$pos}{err_dep}{tot}*100.0/$counts{pos}{$pos}{tot},\n"); task.append(" $counts{pos}{$pos}{err_both}, $counts{pos}{$pos}{err_both}*100.0/$counts{pos}{$pos}{tot} ;\n\n"); task.append("}\n\n"); task.append("printf OUT \"%s\\n\", \" -----------+-------+-------+------+-------+------+-------+-------\" ;\n\n"); task.append("### added by Sabine Buchholz\n"); task.append("printf OUT \"\\n\\n\";\n"); task.append("printf OUT \" Precision and recall of DEPREL\\n\\n\";\n"); task.append("printf OUT \" ----------------+------+---------+--------+------------+---------------\\n\";\n"); task.append("printf OUT \" deprel | gold | correct | system | recall (%%) | precision (%%) \\n\";\n"); task.append("printf OUT \" ----------------+------+---------+--------+------------+---------------\\n\";\n"); task.append("foreach my $dep (sort keys %{$counts{all_dep}}) {\n"); task.append(" # initialize\n"); task.append(" my ($tot_corr, $tot_g, $tot_s, $prec, $rec) = (0, 0, 0, 'NaN', 'NaN');\n\n"); task.append(" if (defined($counts{dep2}{$dep}{$dep})) {\n"); task.append(" $tot_corr = $counts{dep2}{$dep}{$dep};\n"); task.append(" } \n"); task.append(" if (defined($counts{dep}{$dep}{tot})) {\n"); task.append(" $tot_g = $counts{dep}{$dep}{tot};\n"); task.append(" $rec = sprintf(\"%.2f\",$tot_corr / $tot_g * 100);\n"); task.append(" }\n"); task.append(" if (defined($counts{dep_s}{$dep}{tot})) {\n"); task.append(" $tot_s = $counts{dep_s}{$dep}{tot};\n"); task.append(" $prec = sprintf(\"%.2f\",$tot_corr / $tot_s * 100);\n"); task.append(" }\n"); task.append(" printf OUT \" %-15s | %4d | %7d | %6d | %10s | %13s\\n\",\n"); task.append(" $dep, $tot_g, $tot_corr, $tot_s, $rec, $prec;\n"); task.append("}\n\n"); task.append("### DEPREL + ATTACHMENT:\n"); task.append("### Same as Sabine's DEPREL apart from $tot_corr calculation\n"); task.append("printf OUT \"\\n\\n\";\n"); task.append("printf OUT \" Precision and recall of DEPREL + ATTACHMENT\\n\\n\";\n"); task.append("printf OUT \" ----------------+------+---------+--------+------------+---------------\\n\";\n"); task.append("printf OUT \" deprel | gold | correct | system | recall (%%) | precision (%%) \\n\";\n"); task.append("printf OUT \" ----------------+------+---------+--------+------------+---------------\\n\";\n"); task.append("foreach my $dep (sort keys %{$counts{all_dep}}) {\n"); task.append(" # initialize\n"); task.append(" my ($tot_corr, $tot_g, $tot_s, $prec, $rec) = (0, 0, 0, 'NaN', 'NaN');\n\n"); task.append(" if (defined($counts{dep2}{$dep}{$dep})) {\n"); task.append(" if (defined($counts{err_head_corr_dep}{$dep})) {\n"); task.append(" $tot_corr = $counts{dep2}{$dep}{$dep} - $counts{err_head_corr_dep}{$dep};\n"); task.append(" } else {\n"); task.append(" $tot_corr = $counts{dep2}{$dep}{$dep};\n"); task.append(" }\n"); task.append(" } \n"); task.append(" if (defined($counts{dep}{$dep}{tot})) {\n"); task.append(" $tot_g = $counts{dep}{$dep}{tot};\n"); task.append(" $rec = sprintf(\"%.2f\",$tot_corr / $tot_g * 100);\n"); task.append(" }\n"); task.append(" if (defined($counts{dep_s}{$dep}{tot})) {\n"); task.append(" $tot_s = $counts{dep_s}{$dep}{tot};\n"); task.append(" $prec = sprintf(\"%.2f\",$tot_corr / $tot_s * 100);\n"); task.append(" }\n"); task.append(" printf OUT \" %-15s | %4d | %7d | %6d | %10s | %13s\\n\",\n"); task.append(" $dep, $tot_g, $tot_corr, $tot_s, $rec, $prec;\n"); task.append("}\n"); task.append("### DEPREL + ATTACHMENT\n"); task.append("\n"); task.append("printf OUT \"\\n\\n\";\n"); task.append("printf OUT \" Precision and recall of binned HEAD direction\\n\\n\";\n"); task.append("printf OUT \" ----------------+------+---------+--------+------------+---------------\\n\";\n"); task.append("printf OUT \" direction | gold | correct | system | recall (%%) | precision (%%) \\n\";\n"); task.append("printf OUT \" ----------------+------+---------+--------+------------+---------------\\n\";\n"); task.append("foreach my $dir ('to_root', 'left', 'right', 'self') {\n"); task.append(" # initialize\n"); task.append(" my ($tot_corr, $tot_g, $tot_s, $prec, $rec) = (0, 0, 0, 'NaN', 'NaN');\n\n"); task.append(" if (defined($counts{dir2}{$dir}{$dir})) {\n"); task.append(" $tot_corr = $counts{dir2}{$dir}{$dir};\n"); task.append(" } \n"); task.append(" if (defined($counts{dir_g}{$dir}{tot})) {\n"); task.append(" $tot_g = $counts{dir_g}{$dir}{tot};\n"); task.append(" $rec = sprintf(\"%.2f\",$tot_corr / $tot_g * 100);\n"); task.append(" }\n"); task.append(" if (defined($counts{dir_s}{$dir}{tot})) {\n"); task.append(" $tot_s = $counts{dir_s}{$dir}{tot};\n"); task.append(" $prec = sprintf(\"%.2f\",$tot_corr / $tot_s * 100);\n"); task.append(" }\n"); task.append(" printf OUT \" %-15s | %4d | %7d | %6d | %10s | %13s\\n\",\n"); task.append(" $dir, $tot_g, $tot_corr, $tot_s, $rec, $prec;\n"); task.append("}\n\n"); task.append("printf OUT \"\\n\\n\";\n"); task.append("printf OUT \" Precision and recall of binned HEAD distance\\n\\n\";\n"); task.append("printf OUT \" ----------------+------+---------+--------+------------+---------------\\n\";\n"); task.append("printf OUT \" distance | gold | correct | system | recall (%%) | precision (%%) \\n\";\n"); task.append("printf OUT \" ----------------+------+---------+--------+------------+---------------\\n\";\n"); task.append("foreach my $dist ('to_root', '1', '2', '3-6', '7-...') {\n"); task.append(" # initialize\n"); task.append(" my ($tot_corr, $tot_g, $tot_s, $prec, $rec) = (0, 0, 0, 'NaN', 'NaN');\n\n"); task.append(" if (defined($counts{dist2}{$dist}{$dist})) {\n"); task.append(" $tot_corr = $counts{dist2}{$dist}{$dist};\n"); task.append(" } \n"); task.append(" if (defined($counts{dist_g}{$dist}{tot})) {\n"); task.append(" $tot_g = $counts{dist_g}{$dist}{tot};\n"); task.append(" $rec = sprintf(\"%.2f\",$tot_corr / $tot_g * 100);\n"); task.append(" }\n"); task.append(" if (defined($counts{dist_s}{$dist}{tot})) {\n"); task.append(" $tot_s = $counts{dist_s}{$dist}{tot};\n"); task.append(" $prec = sprintf(\"%.2f\",$tot_corr / $tot_s * 100);\n"); task.append(" }\n"); task.append(" printf OUT \" %-15s | %4d | %7d | %6d | %10s | %13s\\n\",\n"); task.append(" $dist, $tot_g, $tot_corr, $tot_s, $rec, $prec;\n"); task.append("}\n\n"); task.append("printf OUT \"\\n\\n\";\n"); task.append("printf OUT \" Frame confusions (gold versus system; *...* marks the head token)\\n\\n\";\n"); task.append("foreach my $frame (sort {$counts{frame2}{$b} <=> $counts{frame2}{$a}} keys %{$counts{frame2}})\n"); task.append("{\n"); task.append(" if ($counts{frame2}{$frame} >= 5) # (make 5 a changeable threshold later)\n"); task.append(" {\n"); task.append(" printf OUT \" %3d %s\\n\", $counts{frame2}{$frame}, $frame;\n"); task.append(" }\n"); task.append("}\n"); task.append("### end of: added by Sabine Buchholz\n\n\n"); task.append("#\n"); task.append("# Leave only the 5 words mostly involved in errors\n"); task.append("#\n\n\n"); task.append("$thresh = (sort {$b <=> $a} values %{$counts{word}{err_any}})[4] ;\n\n"); task.append("# ensure enough space for title\n"); task.append("$max_word_len = length('word') ;\n\n"); task.append("foreach $word (keys %{$counts{word}{err_any}})\n"); task.append("{\n"); task.append(" if ($counts{word}{err_any}{$word} < $thresh)\n"); task.append(" {\n"); task.append(" delete $counts{word}{err_any}{$word} ;\n"); task.append(" next ;\n"); task.append(" }\n\n"); task.append(" $l = uni_len($word) ;\n"); task.append(" if ($l > $max_word_len)\n"); task.append(" {\n"); task.append(" $max_word_len = $l ;\n"); task.append(" }\n"); task.append("}\n\n"); task.append("# filter a case when the difference between the error counts\n"); task.append("# for 2-word and 1-word contexts is small\n"); task.append("# (leave the 2-word context)\n\n"); task.append("foreach $con (keys %{$counts{con_aft_2}{tot}})\n"); task.append("{\n"); task.append(" ($w1) = split(/\\+/, $con) ;\n\n"); task.append(" if (defined $counts{con_aft}{tot}{$w1} &&\n"); task.append(" $counts{con_aft}{tot}{$w1}-$counts{con_aft_2}{tot}{$con} <= 1)\n"); task.append(" {\n"); task.append(" delete $counts{con_aft}{tot}{$w1} ;\n"); task.append(" }\n"); task.append("}\n\n"); task.append("foreach $con (keys %{$counts{con_bef_2}{tot}})\n"); task.append("{\n"); task.append(" ($w_2, $w_1) = split(/\\+/, $con) ;\n\n"); task.append(" if (defined $counts{con_bef}{tot}{$w_1} &&\n"); task.append(" $counts{con_bef}{tot}{$w_1}-$counts{con_bef_2}{tot}{$con} <= 1)\n"); task.append(" {\n"); task.append(" delete $counts{con_bef}{tot}{$w_1} ;\n"); task.append(" }\n"); task.append("}\n\n"); task.append("foreach $con_pos (keys %{$counts{con_pos_aft_2}{tot}})\n"); task.append("{\n"); task.append(" ($p1) = split(/\\+/, $con_pos) ;\n\n"); task.append(" if (defined($counts{con_pos_aft}{tot}{$p1}) &&\n"); task.append(" $counts{con_pos_aft}{tot}{$p1}-$counts{con_pos_aft_2}{tot}{$con_pos} <= 1)\n"); task.append(" {\n"); task.append(" delete $counts{con_pos_aft}{tot}{$p1} ;\n"); task.append(" }\n"); task.append("}\n\n"); task.append("foreach $con_pos (keys %{$counts{con_pos_bef_2}{tot}})\n"); task.append("{\n"); task.append(" ($p_2, $p_1) = split(/\\+/, $con_pos) ;\n\n"); task.append(" if (defined($counts{con_pos_bef}{tot}{$p_1}) &&\n"); task.append(" $counts{con_pos_bef}{tot}{$p_1}-$counts{con_pos_bef_2}{tot}{$con_pos} <= 1)\n"); task.append(" {\n"); task.append(" delete $counts{con_pos_bef}{tot}{$p_1} ;\n"); task.append(" }\n"); task.append("}\n\n"); task.append("# for each context type, take the three contexts most involved in errors\n\n"); task.append("$max_con_len = 0 ;\n\n"); task.append("filter_context_counts($counts{con_bef_2}{tot}, $con_err_num, \\$max_con_len) ;\n\n"); task.append("filter_context_counts($counts{con_bef}{tot}, $con_err_num, \\$max_con_len) ;\n\n"); task.append("filter_context_counts($counts{con_aft}{tot}, $con_err_num, \\$max_con_len) ;\n\n"); task.append("filter_context_counts($counts{con_aft_2}{tot}, $con_err_num, \\$max_con_len) ;\n\n"); task.append("# for each CPOS context type, take the three CPOS contexts most involved in errors\n\n"); task.append("$max_con_pos_len = 0 ;\n\n"); task.append("$thresh = (sort {$b <=> $a} values %{$counts{con_pos_bef_2}{tot}})[$con_err_num-1] ;\n\n"); task.append("foreach $con_pos (keys %{$counts{con_pos_bef_2}{tot}})\n"); task.append("{\n"); task.append(" if ($counts{con_pos_bef_2}{tot}{$con_pos} < $thresh)\n"); task.append(" {\n"); task.append(" delete $counts{con_pos_bef_2}{tot}{$con_pos} ;\n"); task.append(" next ;\n"); task.append(" }\n"); task.append(" if (length($con_pos) > $max_con_pos_len)\n"); task.append(" {\n"); task.append(" $max_con_pos_len = length($con_pos) ;\n"); task.append(" }\n"); task.append("}\n\n"); task.append("$thresh = (sort {$b <=> $a} values %{$counts{con_pos_bef}{tot}})[$con_err_num-1] ;\n\n"); task.append("foreach $con_pos (keys %{$counts{con_pos_bef}{tot}})\n"); task.append("{\n"); task.append(" if ($counts{con_pos_bef}{tot}{$con_pos} < $thresh)\n"); task.append(" {\n"); task.append(" delete $counts{con_pos_bef}{tot}{$con_pos} ;\n"); task.append(" next ;\n"); task.append(" }\n"); task.append(" if (length($con_pos) > $max_con_pos_len)\n"); task.append(" {\n"); task.append(" $max_con_pos_len = length($con_pos) ;\n"); task.append(" }\n"); task.append("}\n"); task.append("\n"); task.append("$thresh = (sort {$b <=> $a} values %{$counts{con_pos_aft}{tot}})[$con_err_num-1] ;\n\n"); task.append("foreach $con_pos (keys %{$counts{con_pos_aft}{tot}})\n"); task.append("{\n"); task.append(" if ($counts{con_pos_aft}{tot}{$con_pos} < $thresh)\n"); task.append(" {\n"); task.append(" delete $counts{con_pos_aft}{tot}{$con_pos} ;\n"); task.append(" next ;\n"); task.append(" }\n"); task.append(" if (length($con_pos) > $max_con_pos_len)\n"); task.append(" {\n"); task.append(" $max_con_pos_len = length($con_pos) ;\n"); task.append(" }\n"); task.append("}\n\n"); task.append("$thresh = (sort {$b <=> $a} values %{$counts{con_pos_aft_2}{tot}})[$con_err_num-1] ;\n\n"); task.append("foreach $con_pos (keys %{$counts{con_pos_aft_2}{tot}})\n"); task.append("{\n"); task.append(" if ($counts{con_pos_aft_2}{tot}{$con_pos} < $thresh)\n"); task.append(" {\n"); task.append(" delete $counts{con_pos_aft_2}{tot}{$con_pos} ;\n"); task.append(" next ;\n"); task.append(" }\n"); task.append(" if (length($con_pos) > $max_con_pos_len)\n"); task.append(" {\n"); task.append(" $max_con_pos_len = length($con_pos) ;\n"); task.append(" }\n"); task.append("}\n\n"); task.append("# printing\n\n"); task.append("# ------------- focus words\n\n"); task.append("printf OUT \"\\n\\n\" ;\n"); task.append("printf OUT \" %d focus words where most of the errors occur:\\n\\n\", scalar keys %{$counts{word}{err_any}} ;\n\n"); task.append("printf OUT \" %-*s | %-4s | %-4s | %-4s | %-4s\\n\", $max_word_len, ' ', 'any', 'head', 'dep', 'both' ;\n"); task.append("printf OUT \" %s-+------+------+------+------\\n\", '-' x $max_word_len;\n\n"); task.append("foreach $word (sort {$counts{word}{err_any}{$b} <=> $counts{word}{err_any}{$a}} keys %{$counts{word}{err_any}})\n"); task.append("{\n"); task.append(" if (!defined($counts{word}{err_head}{$word}))\n"); task.append(" {\n"); task.append(" $counts{word}{err_head}{$word} = 0 ;\n"); task.append(" }\n"); task.append(" if (! defined($counts{word}{err_dep}{$word}))\n"); task.append(" {\n"); task.append(" $counts{word}{err_dep}{$word} = 0 ;\n"); task.append(" }\n"); task.append(" if (! defined($counts{word}{err_any}{$word}))\n"); task.append(" {\n"); task.append(" $counts{word}{err_any}{$word} = 0;\n"); task.append(" }\n"); task.append(" printf OUT \" %-*s | %4d | %4d | %4d | %4d\\n\",\n"); task.append(" $max_word_len+length($word)-uni_len($word), $word, $counts{word}{err_any}{$word},\n"); task.append(" $counts{word}{err_head}{$word},\n"); task.append(" $counts{word}{err_dep}{$word},\n"); task.append(" $counts{word}{err_dep}{$word}+$counts{word}{err_head}{$word}-$counts{word}{err_any}{$word} ;\n"); task.append("}\n\n"); task.append("printf OUT \" %s-+------+------+------+------\\n\", '-' x $max_word_len;\n\n"); task.append("# ------------- contexts\n\n"); task.append("printf OUT \"\\n\\n\" ;\n\n"); task.append("printf OUT \" one-token preceeding contexts where most of the errors occur:\\n\\n\" ;\n\n"); task.append("print_context($counts{con_bef}, $counts{con_pos_bef}, $max_con_len, $max_con_pos_len) ;\n\n"); task.append("printf OUT \" two-token preceeding contexts where most of the errors occur:\\n\\n\" ;\n\n"); task.append("print_context($counts{con_bef_2}, $counts{con_pos_bef_2}, $max_con_len, $max_con_pos_len) ;\n\n"); task.append("printf OUT \" one-token following contexts where most of the errors occur:\\n\\n\" ;\n\n"); task.append("print_context($counts{con_aft}, $counts{con_pos_aft}, $max_con_len, $max_con_pos_len) ;\n\n"); task.append("printf OUT \" two-token following contexts where most of the errors occur:\\n\\n\" ;\n\n"); task.append("print_context($counts{con_aft_2}, $counts{con_pos_aft_2}, $max_con_len, $max_con_pos_len) ;\n\n"); task.append("# ------------- Sentences\n\n"); task.append("printf OUT \" Sentence with the highest number of word errors:\\n\" ;\n"); task.append("$i = (sort { (defined($err_sent[$b]{word}) && $err_sent[$b]{word})\n"); task.append(" <=> (defined($err_sent[$a]{word}) && $err_sent[$a]{word}) } 1 .. $sent_num)[0] ;\n"); task.append("printf OUT \" Sentence %d line %d, \", $i, $starts[$i-1] ;\n"); task.append("printf OUT \"%d head errors, %d dependency errors, %d word errors\\n\",\n"); task.append(" $err_sent[$i]{head}, $err_sent[$i]{dep}, $err_sent[$i]{word} ;\n\n"); task.append("printf OUT \"\\n\\n\" ;\n\n"); task.append("printf OUT \" Sentence with the highest number of head errors:\\n\" ;\n"); task.append("$i = (sort { (defined($err_sent[$b]{head}) && $err_sent[$b]{head}) \n"); task.append(" <=> (defined($err_sent[$a]{head}) && $err_sent[$a]{head}) } 1 .. $sent_num)[0] ;\n"); task.append("printf OUT \" Sentence %d line %d, \", $i, $starts[$i-1] ;\n"); task.append("printf OUT \"%d head errors, %d dependency errors, %d word errors\\n\",\n"); task.append(" $err_sent[$i]{head}, $err_sent[$i]{dep}, $err_sent[$i]{word} ;\n\n"); task.append("printf OUT \"\\n\\n\" ;\n\n"); task.append("printf OUT \" Sentence with the highest number of dependency errors:\\n\" ;\n"); task.append("$i = (sort { (defined($err_sent[$b]{dep}) && $err_sent[$b]{dep}) \n"); task.append(" <=> (defined($err_sent[$a]{dep}) && $err_sent[$a]{dep}) } 1 .. $sent_num)[0] ;\n"); task.append("printf OUT \" Sentence %d line %d, \", $i, $starts[$i-1] ;\n"); task.append("printf OUT \"%d head errors, %d dependency errors, %d word errors\\n\",\n"); task.append(" $err_sent[$i]{head}, $err_sent[$i]{dep}, $err_sent[$i]{word} ;\n\n"); task.append("#\n"); task.append("# Second pass, collect statistics of the frequent errors\n"); task.append("#\n\n"); task.append("# filter the errors, leave the most frequent $freq_err_num errors\n\n"); task.append("$i = 0 ;\n\n"); task.append("$thresh = (sort {$b <=> $a} values %freq_err)[$freq_err_num-1] ;\n\n"); task.append("foreach $err (keys %freq_err)\n"); task.append("{\n"); task.append(" if ($freq_err{$err} < $thresh)\n"); task.append(" {\n"); task.append(" delete $freq_err{$err} ;\n"); task.append(" }\n"); task.append("}\n\n"); task.append("# in case there are several errors with the threshold count\n\n"); task.append("$freq_err_num = scalar keys %freq_err ;\n\n"); task.append("%err_counts = () ;\n\n"); task.append("$eof = 0 ;\n\n"); task.append("seek (GOLD, 0, 0) ;\n"); task.append("seek (SYS, 0, 0) ;\n\n"); task.append("while (! $eof)\n"); task.append("{ # second reading loop\n\n"); task.append(" $eof = read_sent(\\@sent_gold, \\@sent_sys) ;\n"); task.append(" $sent_num++ ;\n\n"); task.append(" $word_num = scalar @sent_gold ;\n\n"); task.append(" # printf \"$sent_num $word_num\\n\" ;\n\n"); task.append(" foreach $i_w (0 .. $word_num-1)\n"); task.append(" { # loop on words\n"); task.append(" ($word, $pos, $head_g, $dep_g)\n"); task.append(" = @{$sent_gold[$i_w]}{'word', 'pos', 'head', 'dep'} ;\n\n"); task.append(" # printf \"%d: %s %s %s %s\\n\", $i_w, $word, $pos, $head_g, $dep_g ;\n\n"); task.append(" if ((! $score_on_punct) && is_uni_punct($word))\n"); task.append(" {\n"); task.append(" # ignore punctuations\n"); task.append(" next ;\n"); task.append(" }\n\n"); task.append(" if ((! $score_on_deriv) && ($dep_g eq 'DERIV'))\n"); task.append(" {\n"); task.append(" # ignore deriv\n"); task.append(" next ;\n"); task.append(" }\n\n"); task.append(" ($head_s, $dep_s) = @{$sent_sys[$i_w]}{'head', 'dep'} ;\n\n"); task.append(" $err_head = ($head_g ne $head_s) ;\n"); task.append(" $err_dep = ($dep_g ne $dep_s) ;\n\n"); task.append(" $head_err = '-' ;\n"); task.append(" $dep_err = '-' ;\n\n"); task.append(" if ($head_g eq '0')\n"); task.append(" {\n"); task.append(" $head_aft_bef_g = '0' ;\n"); task.append(" }\n"); task.append(" elsif ($head_g eq $i_w+1)\n"); task.append(" {\n"); task.append(" $head_aft_bef_g = 'e' ;\n"); task.append(" }\n"); task.append(" else\n"); task.append(" {\n"); task.append(" $head_aft_bef_g = ($head_g <= $i_w+1 ? 'b' : 'a') ;\n"); task.append(" }\n\n"); task.append(" if ($head_s eq '0')\n"); task.append(" {\n"); task.append(" $head_aft_bef_s = '0' ;\n"); task.append(" }\n"); task.append(" elsif ($head_s eq $i_w+1)\n"); task.append(" {\n"); task.append(" $head_aft_bef_s = 'e' ;\n"); task.append(" }\n"); task.append(" else\n"); task.append(" {\n"); task.append(" $head_aft_bef_s = ($head_s <= $i_w+1 ? 'b' : 'a') ;\n"); task.append(" }\n\n"); task.append(" $head_aft_bef = $head_aft_bef_g.$head_aft_bef_s ;\n\n"); task.append(" if ($err_head)\n"); task.append(" {\n"); task.append(" if ($head_aft_bef_s eq '0')\n"); task.append(" {\n"); task.append(" $head_err = 0 ;\n"); task.append(" }\n"); task.append(" else\n"); task.append(" {\n"); task.append(" $head_err = $head_s-$head_g ;\n"); task.append(" }\n"); task.append(" }\n\n"); task.append(" if ($err_dep)\n"); task.append(" {\n"); task.append(" $dep_err = $dep_g.'->'.$dep_s ;\n"); task.append(" }\n\n"); task.append(" if (! ($err_head || $err_dep))\n"); task.append(" {\n"); task.append(" next ;\n"); task.append(" }\n\n"); task.append(" # handle only the most frequent errors\n\n"); task.append(" $err = $head_err.$sep.$head_aft_bef.$sep.$dep_err ;\n\n"); task.append(" if (! exists $freq_err{$err})\n"); task.append(" {\n"); task.append(" next ;\n"); task.append(" }\n\n"); task.append(" ($w_2, $w_1, $w1, $w2, $p_2, $p_1, $p1, $p2) = get_context(\\@sent_gold, $i_w) ;\n\n"); task.append(" $con_bef = $w_1 ;\n"); task.append(" $con_bef_2 = $w_2.' + '.$w_1 ;\n"); task.append(" $con_aft = $w1 ;\n"); task.append(" $con_aft_2 = $w1.' + '.$w2 ;\n\n"); task.append(" $con_pos_bef = $p_1 ;\n"); task.append(" $con_pos_bef_2 = $p_2.'+'.$p_1 ;\n"); task.append(" $con_pos_aft = $p1 ;\n"); task.append(" $con_pos_aft_2 = $p1.'+'.$p2 ;\n\n"); task.append(" @cur_err = ($con_pos_bef, $con_bef, $word, $pos, $con_pos_aft, $con_aft) ;\n\n"); task.append(" # printf \"# %-25s %-15s %-10s %-25s %-3s %-30s\\n\",\n"); task.append(" # $con_bef, $word, $pos, $con_aft, $head_err, $dep_err ;\n\n"); task.append(" @bits = (0, 0, 0, 0, 0, 0) ;\n"); task.append(" $j = 0 ;\n\n"); task.append(" while ($j == 0)\n"); task.append(" {\n"); task.append(" for ($i = 0; $i <= $#bits; $i++)\n"); task.append(" {\n"); task.append(" if ($bits[$i] == 0)\n"); task.append(" {\n"); task.append(" $bits[$i] = 1 ;\n"); task.append(" $j = 0 ;\n"); task.append(" last ;\n"); task.append(" }\n"); task.append(" else\n"); task.append(" {\n"); task.append(" $bits[$i] = 0 ;\n"); task.append(" $j = 1 ;\n"); task.append(" }\n"); task.append(" }\n\n"); task.append(" @e_bits = @cur_err ;\n\n"); task.append(" for ($i = 0; $i <= $#bits; $i++)\n"); task.append(" {\n"); task.append(" if (! $bits[$i])\n"); task.append(" {\n"); task.append(" $e_bits[$i] = '*' ;\n"); task.append(" }\n"); task.append(" }\n\n"); task.append(" # include also the last case which is the most general\n"); task.append(" # (wildcards for everything)\n"); task.append(" $err_counts{$err}{join($sep, @e_bits)}++ ;\n\n"); task.append(" }\n\n"); task.append(" } # loop on words\n"); task.append("} # second reading loop\n\n"); task.append("printf OUT \"\\n\\n\" ;\n"); task.append("printf OUT \" Specific errors, %d most frequent errors:\", $freq_err_num ;\n"); task.append("printf OUT \"\\n %s\\n\", '=' x 41 ;\n\n\n"); task.append("# deleting local contexts which are too general\n\n"); task.append("foreach $err (keys %err_counts)\n"); task.append("{\n"); task.append(" foreach $loc_con (sort {$err_counts{$err}{$b} <=> $err_counts{$err}{$a}}\n"); task.append(" keys %{$err_counts{$err}})\n"); task.append(" {\n"); task.append(" @cur_err = split(/\\Q$sep\\E/, $loc_con) ;\n\n"); task.append(" # In this loop, one or two elements of the local context are\n"); task.append(" # replaced with '*' to make it more general. If the entry for\n"); task.append(" # the general context has the same count it is removed.\n\n"); task.append(" foreach $i (0 .. $#cur_err)\n"); task.append(" {\n"); task.append(" $w1 = $cur_err[$i] ;\n"); task.append(" if ($cur_err[$i] eq '*')\n"); task.append(" {\n"); task.append(" next ;\n"); task.append(" }\n"); task.append(" $cur_err[$i] = '*' ;\n"); task.append(" $con1 = join($sep, @cur_err) ;\n"); task.append(" if ( defined($err_counts{$err}{$con1}) && defined($err_counts{$err}{$loc_con})\n"); task.append(" && ($err_counts{$err}{$con1} == $err_counts{$err}{$loc_con}))\n"); task.append(" {\n"); task.append(" delete $err_counts{$err}{$con1} ;\n"); task.append(" }\n"); task.append(" for ($j = $i+1; $j <=$#cur_err; $j++)\n"); task.append(" {\n"); task.append(" if ($cur_err[$j] eq '*')\n"); task.append(" {\n"); task.append(" next ;\n"); task.append(" }\n"); task.append(" $w2 = $cur_err[$j] ;\n"); task.append(" $cur_err[$j] = '*' ;\n"); task.append(" $con1 = join($sep, @cur_err) ;\n"); task.append(" if ( defined($err_counts{$err}{$con1}) && defined($err_counts{$err}{$loc_con})\n"); task.append(" && ($err_counts{$err}{$con1} == $err_counts{$err}{$loc_con}))\n"); task.append(" {\n"); task.append(" delete $err_counts{$err}{$con1} ;\n"); task.append(" }\n"); task.append(" $cur_err[$j] = $w2 ;\n"); task.append(" }\n"); task.append(" $cur_err[$i] = $w1 ;\n"); task.append(" }\n"); task.append(" }\n"); task.append("}\n\n"); task.append("# Leaving only the topmost local contexts for each error\n\n"); task.append("foreach $err (keys %err_counts)\n"); task.append("{\n"); task.append(" $thresh = (sort {$b <=> $a} values %{$err_counts{$err}})[$spec_err_loc_con-1] || 0 ;\n\n"); task.append(" # of the threshold is too low, take the 2nd highest count\n"); task.append(" # (the highest may be the total which is the generic case\n"); task.append(" # and not relevant for printing)\n\n"); task.append(" if ($thresh < 5)\n"); task.append(" {\n"); task.append(" $thresh = (sort {$b <=> $a} values %{$err_counts{$err}})[1] ;\n"); task.append(" }\n\n"); task.append(" foreach $loc_con (keys %{$err_counts{$err}})\n"); task.append(" {\n"); task.append(" if ($err_counts{$err}{$loc_con} < $thresh)\n"); task.append(" {\n"); task.append(" delete $err_counts{$err}{$loc_con} ;\n"); task.append(" }\n"); task.append(" else\n"); task.append(" {\n"); task.append(" if ($loc_con ne join($sep, ('*', '*', '*', '*', '*', '*')))\n"); task.append(" {\n"); task.append(" $loc_con_err_counts{$loc_con}{$err} = $err_counts{$err}{$loc_con} ;\n"); task.append(" }\n"); task.append(" }\n"); task.append(" }\n"); task.append("}\n\n"); task.append("# printing an error summary\n\n"); task.append("# calculating the context field length\n\n"); task.append("$max_word_spec_len= length('word') ;\n"); task.append("$max_con_aft_len = length('word') ;\n"); task.append("$max_con_bef_len = length('word') ;\n"); task.append("$max_con_pos_len = length('CPOS') ;\n\n"); task.append("foreach $err (keys %err_counts)\n"); task.append("{\n"); task.append(" foreach $loc_con (sort keys %{$err_counts{$err}})\n"); task.append(" {\n"); task.append(" ($con_pos_bef, $con_bef, $word, $pos, $con_pos_aft, $con_aft) =\n"); task.append(" split(/\\Q$sep\\E/, $loc_con) ;\n\n"); task.append(" $l = uni_len($word) ;\n"); task.append(" if ($l > $max_word_spec_len)\n"); task.append(" {\n"); task.append(" $max_word_spec_len = $l ;\n"); task.append(" }\n\n"); task.append(" $l = uni_len($con_bef) ;\n"); task.append(" if ($l > $max_con_bef_len)\n"); task.append(" {\n"); task.append(" $max_con_bef_len = $l ;\n"); task.append(" }\n\n"); task.append(" $l = uni_len($con_aft) ;\n"); task.append(" if ($l > $max_con_aft_len)\n"); task.append(" {\n"); task.append(" $max_con_aft_len = $l ;\n"); task.append(" }\n\n"); task.append(" if (length($con_pos_aft) > $max_con_pos_len)\n"); task.append(" {\n"); task.append(" $max_con_pos_len = length($con_pos_aft) ;\n"); task.append(" }\n\n"); task.append(" if (length($con_pos_bef) > $max_con_pos_len)\n"); task.append(" {\n"); task.append(" $max_con_pos_len = length($con_pos_bef) ;\n"); task.append(" }\n"); task.append(" }\n"); task.append("}\n\n"); task.append("$err_counter = 0 ;\n\n"); task.append("foreach $err (sort {$freq_err{$b} <=> $freq_err{$a}} keys %freq_err)\n"); task.append("{\n\n"); task.append(" ($head_err, $head_aft_bef, $dep_err) = split(/\\Q$sep\\E/, $err) ;\n\n"); task.append(" $err_counter++ ;\n"); task.append(" $err_desc{$err} = sprintf(\"%2d. \", $err_counter).\n"); task.append(" describe_err($head_err, $head_aft_bef, $dep_err) ;\n\n"); task.append(" # printf OUT \" %-3s %-30s %d\\n\", $head_err, $dep_err, $freq_err{$err} ;\n"); task.append(" printf OUT \"\\n\" ;\n"); task.append(" printf OUT \" %s : %d times\\n\", $err_desc{$err}, $freq_err{$err} ;\n\n"); task.append(" printf OUT \" %s-+-%s-+-%s-+-%s-+-%s-+-%s-+------\\n\",\n"); task.append(" '-' x $max_con_pos_len, '-' x $max_con_bef_len,\n"); task.append(" '-' x $max_pos_len, '-' x $max_word_spec_len,\n"); task.append(" '-' x $max_con_pos_len, '-' x $max_con_aft_len ;\n\n"); task.append(" printf OUT \" %-*s | %-*s | %-*s | %s\\n\",\n"); task.append(" $max_con_pos_len+$max_con_bef_len+3, ' Before',\n"); task.append(" $max_word_spec_len+$max_pos_len+3, ' Focus',\n"); task.append(" $max_con_pos_len+$max_con_aft_len+3, ' After',\n"); task.append(" 'Count' ;\n\n"); task.append(" printf OUT \" %-*s %-*s | %-*s %-*s | %-*s %-*s |\\n\",\n"); task.append(" $max_con_pos_len, 'CPOS', $max_con_bef_len, 'word',\n"); task.append(" $max_pos_len, 'CPOS', $max_word_spec_len, 'word',\n"); task.append(" $max_con_pos_len, 'CPOS', $max_con_aft_len, 'word' ;\n\n"); task.append(" printf OUT \" %s-+-%s-+-%s-+-%s-+-%s-+-%s-+------\\n\",\n"); task.append(" '-' x $max_con_pos_len, '-' x $max_con_bef_len,\n"); task.append(" '-' x $max_pos_len, '-' x $max_word_spec_len,\n"); task.append(" '-' x $max_con_pos_len, '-' x $max_con_aft_len ;\n\n"); task.append(" foreach $loc_con (sort {$err_counts{$err}{$b} <=> $err_counts{$err}{$a}}\n"); task.append(" keys %{$err_counts{$err}})\n"); task.append(" {\n"); task.append(" if ($loc_con eq join($sep, ('*', '*', '*', '*', '*', '*')))\n"); task.append(" {\n"); task.append(" next ;\n"); task.append(" }\n\n"); task.append(" $con1 = $loc_con ;\n"); task.append(" $con1 =~ s/\\*/ /g ;\n\n"); task.append(" ($con_pos_bef, $con_bef, $word, $pos, $con_pos_aft, $con_aft) =\n"); task.append(" split(/\\Q$sep\\E/, $con1) ;\n\n"); task.append(" printf OUT \" %-*s | %-*s | %-*s | %-*s | %-*s | %-*s | %3d\\n\",\n"); task.append(" $max_con_pos_len, $con_pos_bef, $max_con_bef_len+length($con_bef)-uni_len($con_bef), $con_bef,\n"); task.append(" $max_pos_len, $pos, $max_word_spec_len+length($word)-uni_len($word), $word,\n"); task.append(" $max_con_pos_len, $con_pos_aft, $max_con_aft_len+length($con_aft)-uni_len($con_aft), $con_aft,\n"); task.append(" $err_counts{$err}{$loc_con} ;\n"); task.append(" }\n\n"); task.append(" printf OUT \" %s-+-%s-+-%s-+-%s-+-%s-+-%s-+------\\n\",\n"); task.append(" '-' x $max_con_pos_len, '-' x $max_con_bef_len,\n"); task.append(" '-' x $max_pos_len, '-' x $max_word_spec_len,\n"); task.append(" '-' x $max_con_pos_len, '-' x $max_con_aft_len ;\n\n"); task.append("}\n\n"); task.append("printf OUT \"\\n\\n\" ;\n"); task.append("printf OUT \" Local contexts involved in several frequent errors:\" ;\n"); task.append("printf OUT \"\\n %s\\n\", '=' x 51 ;\n"); task.append("printf OUT \"\\n\\n\" ;\n\n"); task.append("foreach $loc_con (sort {scalar keys %{$loc_con_err_counts{$b}} <=>\n"); task.append(" scalar keys %{$loc_con_err_counts{$a}}}\n"); task.append(" keys %loc_con_err_counts)\n"); task.append("{\n\n"); task.append(" if (scalar keys %{$loc_con_err_counts{$loc_con}} == 1)\n"); task.append(" {\n"); task.append(" next ;\n"); task.append(" }\n\n"); task.append(" printf OUT \" %s-+-%s-+-%s-+-%s-+-%s-+-%s-\\n\",\n"); task.append(" '-' x $max_con_pos_len, '-' x $max_con_bef_len,\n"); task.append(" '-' x $max_pos_len, '-' x $max_word_spec_len,\n"); task.append(" '-' x $max_con_pos_len, '-' x $max_con_aft_len ;\n\n"); task.append(" printf OUT \" %-*s | %-*s | %-*s \\n\",\n"); task.append(" $max_con_pos_len+$max_con_bef_len+3, ' Before',\n"); task.append(" $max_word_spec_len+$max_pos_len+3, ' Focus',\n"); task.append(" $max_con_pos_len+$max_con_aft_len+3, ' After' ;\n\n"); task.append(" printf OUT \" %-*s %-*s | %-*s %-*s | %-*s %-*s \\n\",\n"); task.append(" $max_con_pos_len, 'CPOS', $max_con_bef_len, 'word',\n"); task.append(" $max_pos_len, 'CPOS', $max_word_spec_len, 'word',\n"); task.append(" $max_con_pos_len, 'CPOS', $max_con_aft_len, 'word' ;\n\n"); task.append(" printf OUT \" %s-+-%s-+-%s-+-%s-+-%s-+-%s-\\n\",\n"); task.append(" '-' x $max_con_pos_len, '-' x $max_con_bef_len,\n"); task.append(" '-' x $max_pos_len, '-' x $max_word_spec_len,\n"); task.append(" '-' x $max_con_pos_len, '-' x $max_con_aft_len ;\n\n"); task.append(" $con1 = $loc_con ;\n"); task.append(" $con1 =~ s/\\*/ /g ;\n\n"); task.append(" ($con_pos_bef, $con_bef, $word, $pos, $con_pos_aft, $con_aft) =\n"); task.append(" split(/\\Q$sep\\E/, $con1) ;\n\n"); task.append(" printf OUT \" %-*s | %-*s | %-*s | %-*s | %-*s | %-*s \\n\",\n"); task.append(" $max_con_pos_len, $con_pos_bef, $max_con_bef_len+length($con_bef)-uni_len($con_bef), $con_bef,\n"); task.append(" $max_pos_len, $pos, $max_word_spec_len+length($word)-uni_len($word), $word,\n"); task.append(" $max_con_pos_len, $con_pos_aft, $max_con_aft_len+length($con_aft)-uni_len($con_aft), $con_aft ;\n\n"); task.append(" printf OUT \" %s-+-%s-+-%s-+-%s-+-%s-+-%s-\\n\",\n"); task.append(" '-' x $max_con_pos_len, '-' x $max_con_bef_len,\n"); task.append(" '-' x $max_pos_len, '-' x $max_word_spec_len,\n"); task.append(" '-' x $max_con_pos_len, '-' x $max_con_aft_len ;\n\n"); task.append(" foreach $err (sort {$loc_con_err_counts{$loc_con}{$b} <=>\n"); task.append(" $loc_con_err_counts{$loc_con}{$a}}\n"); task.append(" keys %{$loc_con_err_counts{$loc_con}})\n"); task.append(" {\n"); task.append(" printf OUT \" %s : %d times\\n\", $err_desc{$err},\n"); task.append(" $loc_con_err_counts{$loc_con}{$err} ;\n"); task.append(" }\n\n"); task.append(" printf OUT \"\\n\" ;\n"); task.append("}\n\n"); task.append("close GOLD ;\n"); task.append("close SYS ;\n\n"); task.append("close OUT ;\n"); return task.toString(); } }