package com.cognitionis.feature_builder; import java.io.*; import java.util.*; import com.cognitionis.nlp_files.*; import com.cognitionis.utils_basickit.XmlAttribs; /** * * @author Héctor Llorens * @since 2011 */ public class Timen { public static String get_timen(String features_and_attributes, String lang) { String output; PipesFile nlpfile = new PipesFile(features_and_attributes); ((PipesFile) nlpfile).isWellFormedOptimist(); output = getTimenFormat((PipesFile) nlpfile, new Locale(lang)); return output; } public static String getTimenFormat(PipesFile pipesfile, Locale l) { String outputfile = null; int numline = 0; try { outputfile = pipesfile.getFile().getCanonicalPath() + ".TempEval-classik-features"; BufferedWriter outfile = new BufferedWriter(new FileWriter(outputfile)); BufferedReader pipesreader = new BufferedReader(new FileReader(pipesfile.getFile())); HashMap<String, String[]> DCTs = TempEvalFiles.getDCTsFromTab(pipesfile.getFile().getParent() + "/dct.tab"); int iob2col = pipesfile.getColumn("element\\(IOB2\\)"); int attrscol = iob2col + 1; int filecol = pipesfile.getColumn("file"); int tokencol = pipesfile.getColumn("(word|token)"); int tensecol = pipesfile.getColumn("tense"); String file =""; String word = ""; String id = ""; String tense = ""; if (iob2col == -1 || tokencol == -1) { String notFoundCol = ""; if (iob2col == -1) { notFoundCol += "element,attribs,"; } if (tokencol == -1) { notFoundCol += "word/token,"; } throw new Exception("Some of the required columns (element,word/token,POS) not found: " + notFoundCol); } String pipeslineant = "--prior first line--"; try { String line; String[] linearr; HashMap<String, String> tempexAttribsHash = null; while ((line = pipesreader.readLine()) != null) { numline++; linearr = line.split("\\|"); if (linearr.length >= pipesfile.getPipesDescArrCount()) { if (linearr[iob2col].matches("B-.*")) { if (!word.equals("")) { outfile.write(id + "|" + word + "|" + tense + "|"+DCTs.get(file)[0]+"\n"); id = ""; word = ""; tense = ""; } word = linearr[tokencol]; tense = linearr[tensecol]; file = linearr[filecol]; if (linearr[attrscol].matches(".*=.*=.*") && !linearr[attrscol].contains(";")) { tempexAttribsHash = XmlAttribs.parseXMLattrs(linearr[attrscol]); } else { tempexAttribsHash = XmlAttribs.parseSemiColonAttrs(linearr[attrscol]); } id = tempexAttribsHash.get("tid"); } if (linearr[iob2col].matches("I-.*")) { if (word.equals("")) { throw new Exception("Malformed annotation: " + line + "\n Prev: " + pipeslineant); } word += "_" + linearr[tokencol]; } } pipeslineant = line; } if (!word.equals("")) { outfile.write(id + "|" + word + "|" + tense + "|"+DCTs.get(file)[0]+"\n"); } } finally { if (pipesreader != null) { pipesreader.close(); } if (outfile != null) { outfile.close(); } } } catch (Exception e) { System.err.println("Errors found (CLASSIK):\n\t" + e.toString() + "\n"); if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) { e.printStackTrace(System.err); System.exit(1); } return null; } return outputfile; } }