package com.cognitionis.feature_builder; /** * * @author Héctor Llorens * @since 2011 */ import com.cognitionis.nlp_files.*; import java.io.*; import java.util.*; import com.cognitionis.utils_basickit.*; import com.cognitionis.external_tools.*; import com.cognitionis.knowledgek.NUMEK.NUMEK; import com.cognitionis.nlp_files.parentical_parsers.*; public class BaseTokenFeatures { /** * Returns the input TabFile, annotated with the featurevector for the approach and language * * @param lang language code (en for English, es for Spanish) * @param file input TabFile (base-segmentation.tab format) * @param feature_vector features to be obtained (default: TempEval2-features) * @param approach approach required features (TIPSem or TIPSemB) * * @return outputfilename */ public static void getFeatures4Tab(String lang, String file, String feature_vector, String approach) { NLPFile nlpfile = null; String output = null; try { if (approach.matches("(?i)(TIPSem|TIPSemB)")) { if (feature_vector.matches("(TempEval2-features|(Dynamic|Static)Win-features)")) { nlpfile = new TabFile(file); nlpfile.setLanguage(lang); System.err.println("Executing GET_PIPES"); output = ((TabFile) nlpfile).getPipesFile(); output = FileUtils.renameTo(output, "\\.tab\\.pipes", "\\.TempEval-bs"); if (approach.equalsIgnoreCase("TIPSem")) { if (lang.equalsIgnoreCase("EN")) { nlpfile = new PipesFile(output); nlpfile.setLanguage(lang); ((PipesFile) nlpfile).isWellFormedOptimist(); System.err.println("Executing TO_PLAIN"); output = ((PipesFile) nlpfile).toPlain(); System.err.println("Executing SRL_Roth"); File f = new File(output + ".roth"); if (!f.exists()) { output = SRL_Roth.run(output, 0); } else { System.err.println(" OMITING"); output = output + ".roth"; } System.err.println("Executing TREETAG"); f = new File(output + "-treetag"); if (!f.exists()) { output = TreeTagger.run_and_merge(output, 0, 2, 6); } else { System.err.println(" OMITING"); output = output + "-treetag"; } output = WN_features(output, lang); 
output = roles_features(output, lang); nlpfile=new PipesFile(output); nlpfile.setLanguage(lang); ((PipesFile) nlpfile).isWellFormedOptimist(); String model = nlpfile.getFile().toString().substring(0, nlpfile.getFile().toString().lastIndexOf(".plain")); if (model.endsWith("\\.tml")) { output = addFileSentTokenColumns(((PipesFile) nlpfile)); } else { System.err.println("Pairing PIPES"); output = ((PipesFile) nlpfile).pair_pipes_by_column_JOIN(0, model, 3); output = FileUtils.renameTo(output, "\\.TempEval-bs\\.plain\\.roth-treetag-WNHyps-roleconfig-simpleroles-mainphrases\\.paired", "\\.TempEval2-features"); } } // already processed with AnCora if (lang.equalsIgnoreCase("ES")) { nlpfile = new PipesFile(output); nlpfile.setLanguage(lang); File ancora = new File(file.substring(0, file.lastIndexOf(".")) + ".TempEval-bs.plain.roth-treetag"); if (ancora.exists() && ancora.isFile()) { output = WN_features(ancora.getAbsolutePath(), lang); output = roles_features(output, lang); ((PipesFile) nlpfile).isWellFormedOptimist(); String model = nlpfile.getFile().toString().substring(0, nlpfile.getFile().toString().lastIndexOf(".plain")); if (model.endsWith("\\.tml")) { output = addFileSentTokenColumns(((PipesFile) nlpfile)); } else { System.err.println("Pairing PIPES"); output = ((PipesFile) nlpfile).pair_pipes_by_column_JOIN(0, model, 3); output = FileUtils.renameTo(output, "\\.TempEval-bs\\.plain\\.roth-treetag-WNHyps-roleconfig-simpleroles-mainphrases\\.paired", "\\.TempEval2-features"); } } else { nlpfile = new PipesFile(output); nlpfile.setLanguage(lang); ((PipesFile) nlpfile).isWellFormedOptimist(); System.err.println("Executing TO_TOKENS"); output = ((PipesFile) nlpfile).saveColumnFile("word"); System.err.println("Executing FreeLing"); File f = new File(output + ".freeling"); if (!f.exists()) { output = FreeLing.run(output, lang, 0); } else { System.err.println(" OMITING"); output = output + ".freeling"; } output = WN_features(output, lang); nlpfile = new PipesFile(output); 
nlpfile.setLanguage(lang); ((PipesFile) nlpfile).isWellFormedOptimist(); System.err.println("Executing lemmaPOS2TempEval2_features"); output = lemmaPOS2TempEval2_features((PipesFile) nlpfile, lang); nlpfile=new PipesFile(output); nlpfile.setLanguage(lang); ((PipesFile) nlpfile).isWellFormedOptimist(); String model = nlpfile.getFile().toString().substring(0, nlpfile.getFile().toString().lastIndexOf(".word")); System.err.println("Pairing PIPES"); output = ((PipesFile) nlpfile).pair_pipes_by_column_JOIN(0, model, 3); output = FileUtils.renameTo(output, "\\.TempEval-bs\\.word\\.freeling-WNHyps-POS2\\.paired", "\\.TempEval2-features"); } } if (feature_vector.equalsIgnoreCase("DynamicWin-features")) { nlpfile=new PipesFile(output); nlpfile.setLanguage(lang); ((PipesFile) nlpfile).isWellFormedOptimist(); System.err.println("Executing DynamicWin features"); output = BaseTokenFeatures.getDynamicWin((PipesFile) nlpfile); output = FileUtils.renameTo(output, "\\.TempEval2-features\\.DynamicWin-features", "\\.DynamicWin-features"); } if (feature_vector.equalsIgnoreCase("StaticWin-features")) { nlpfile=new PipesFile(output); nlpfile.setLanguage(lang); ((PipesFile) nlpfile).isWellFormedOptimist(); System.err.println("Executing StaticcWin features"); output = BaseTokenFeatures.getStaticWin((PipesFile) nlpfile); output = FileUtils.renameTo(output, "\\.TempEval2-features\\.StaticWin-features", "\\.StaticWin-features"); } } else { //TIPSem-B if (lang.equalsIgnoreCase("EN")) { nlpfile = new PipesFile(output); nlpfile.setLanguage(lang); ((PipesFile) nlpfile).isWellFormedOptimist(); if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) { System.err.println("Executing TO_TOKENS"); } output = ((PipesFile) nlpfile).saveColumnFile("word"); if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) { System.err.println("Executing treetag"); } File f = new File(output + ".treetag"); if (!f.exists()) { output = 
TreeTagger.run_tok(output); } else { System.err.println(" OMITING"); output = output + ".treetag"; } nlpfile = new PipesFile(output); ((PipesFile) nlpfile).isWellFormedOptimist(); if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) { System.err.println("Executing lemmaPOS2TempEval2_features"); } output = lemmaPOS2TempEval2_features((PipesFile) nlpfile, lang); nlpfile=new PipesFile(output); nlpfile.setLanguage(lang); ((PipesFile) nlpfile).isWellFormedOptimist(); String model = nlpfile.getFile().toString().substring(0, nlpfile.getFile().toString().lastIndexOf(".word")); if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) { System.err.println("Pairing PIPES"); } output = ((PipesFile) nlpfile).pair_pipes_by_column_JOIN(0, model, 3); output = FileUtils.renameTo(output, "\\.TempEval-bs\\.word\\.treetag-POS2\\.paired", "\\.TempEval2-features"); } if (lang.equalsIgnoreCase("ES")) { nlpfile = new PipesFile(output); nlpfile.setLanguage(lang); ((PipesFile) nlpfile).isWellFormedOptimist(); if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) { System.err.println("Executing TO_TOKENS"); } output = ((PipesFile) nlpfile).saveColumnFile("word"); if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) { System.err.println("Executing FreeLing"); } File f = new File(output + ".freeling"); if (!f.exists()) { output = FreeLing.run(output, lang, 0); } else { System.err.println(" OMITING"); output = output + ".freeling"; } nlpfile = new PipesFile(output); nlpfile.setLanguage(lang); ((PipesFile) nlpfile).isWellFormedOptimist(); if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) { System.err.println("Executing lemmaPOS2TempEval2_features"); } output = lemmaPOS2TempEval2_features((PipesFile) nlpfile, lang); nlpfile=new PipesFile(output); nlpfile.setLanguage(lang); 
((PipesFile) nlpfile).isWellFormedOptimist(); String model = nlpfile.getFile().toString().substring(0, nlpfile.getFile().toString().lastIndexOf(".word")); if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) { System.err.println("Pairing PIPES"); } output = ((PipesFile) nlpfile).pair_pipes_by_column_JOIN(0, model, 3); output = FileUtils.renameTo(output, "\\.TempEval-bs\\.word\\.freeling-POS2\\.paired", "\\.TempEval2-features"); } } } else { throw new Exception("Unknown feature vector: " + feature_vector); } } else { throw new Exception("Unknown approach: " + approach); } } catch (Exception e) { System.err.println("Errors found (Experimenter):\n\t" + e.toString() + "\n"); if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) { e.printStackTrace(System.err); System.exit(1); } } } /** * Returns the input PlainFile, annotated with the featurevector for the approach and language * * @param lang language code (en for English, es for Spanish) * @param file input PlainFile (text.txt format) * @param tokenize 0 false 1 true * @param clean false|true delete temporal files * @param feature_vector features to be obtained (default: TempEval2-features) * @param approach approach required features (TIPSem or TIPSemB) * * @return outputfilename */ public static String getFeatures4Plain(String lang, String file, int tokenize, boolean clean, String feature_vector, String approach) { NLPFile nlpfile = null; String output = null; try { output = file; if (approach.matches("(?i)(TIPSem|TIPSemB)")) { if (feature_vector.matches("(TempEval2-features|(Dynamic|Static)Win-features)")) { if (approach.equalsIgnoreCase("TIPSem")) { if (lang.equalsIgnoreCase("EN")) { if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) { System.err.println("Executing SRL_Roth"); } File f = new File(output + ".roth"); if (!f.exists()) { output = SRL_Roth.run(output, tokenize); } else { 
if ("true".equalsIgnoreCase(System.getProperty("DEBUG"))) {
                                    System.err.println(" OMITING");
                                }
                                output = output + ".roth";
                            }
                            if ("true".equalsIgnoreCase(System.getProperty("DEBUG"))) {
                                System.err.println("Executing TREETAG");
                            }
                            f = new File(output + "-treetag");
                            if (!f.exists()) {
                                output = TreeTagger.run_and_merge(output, 0, 2, 6);
                            } else {
                                if ("true".equalsIgnoreCase(System.getProperty("DEBUG"))) {
                                    System.err.println(" OMITING");
                                }
                                output = output + "-treetag";
                            }
                            output = WN_features(output, lang);
                            output = roles_features(output, lang);
                        }
                        if (lang.equalsIgnoreCase("ES")) {
                            if ("true".equalsIgnoreCase(System.getProperty("DEBUG"))) {
                                System.err.println("Executing Freeling");
                            }
                            File f = new File(output + ".freeling");
                            if (!f.exists()) {
                                output = FreeLing.run(output, lang, tokenize);
                            } else {
                                if ("true".equalsIgnoreCase(System.getProperty("DEBUG"))) {
                                    System.err.println(" OMITING");
                                }
                                output = output + ".freeling";
                            }
                            output = WN_features(output, lang);
                            nlpfile = new PipesFile(output);
                            nlpfile.setLanguage(lang);
                            ((PipesFile) nlpfile).isWellFormedOptimist();
                            if ("true".equalsIgnoreCase(System.getProperty("DEBUG"))) {
                                System.err.println("Executing lemmaPOS2TempEval2_features");
                            }
                            output = lemmaPOS2TempEval2_features((PipesFile) nlpfile, lang);
                        }
                        nlpfile = new PipesFile(output);
                        nlpfile.setLanguage(lang);
                        ((PipesFile) nlpfile).isWellFormedOptimist();
                        output = GetPairSpecialTempEval2_features(((PipesFile) nlpfile), file);
                    } else { // TIPSem-B
                        if (lang.equalsIgnoreCase("EN")) {
                            if ("true".equalsIgnoreCase(System.getProperty("DEBUG"))) {
                                System.err.println("Executing TREETAG");
                            }
                            File f = new File(output + ".treetag");
                            if (!f.exists()) {
                                output = TreeTagger.run_tok(output);
                            } else {
                                if ("true".equalsIgnoreCase(System.getProperty("DEBUG"))) {
                                    System.err.println(" OMITING");
                                }
                                output = output + ".treetag";
                            }
                            nlpfile = new PipesFile(output);
                            nlpfile.setLanguage(lang);
                            ((PipesFile) nlpfile).isWellFormedOptimist();
                            if ("true".equalsIgnoreCase(System.getProperty("DEBUG"))) {
                                System.err.println("Executing lemmaPOS2TempEval2_features");
                            }
                            output = lemmaPOS2TempEval2_features((PipesFile) nlpfile, lang);
                            nlpfile = new PipesFile(output);
                            nlpfile.setLanguage(lang);
                            ((PipesFile) nlpfile).isWellFormedOptimist();
                            output = GetPairSpecialTempEval2_features(((PipesFile) nlpfile), file);
                        }
                        if (lang.equalsIgnoreCase("ES")) {
                            if ("true".equalsIgnoreCase(System.getProperty("DEBUG"))) {
                                System.err.println("Executing Freeling");
                            }
                            File f = new File(output + ".freeling");
                            if (!f.exists()) {
                                output = FreeLing.run(output, lang, tokenize);
                            } else {
                                if ("true".equalsIgnoreCase(System.getProperty("DEBUG"))) {
                                    System.err.println(" OMITING");
                                }
                                output = output + ".freeling";
                            }
                            nlpfile = new PipesFile(output);
                            nlpfile.setLanguage(lang);
                            ((PipesFile) nlpfile).isWellFormedOptimist();
                            if ("true".equalsIgnoreCase(System.getProperty("DEBUG"))) {
                                System.err.println("Executing lemmaPOS2TempEval2_features");
                            }
                            output = lemmaPOS2TempEval2_features((PipesFile) nlpfile, lang);
                            nlpfile = new PipesFile(output);
                            nlpfile.setLanguage(lang);
                            ((PipesFile) nlpfile).isWellFormedOptimist();
                            output = GetPairSpecialTempEval2_features(((PipesFile) nlpfile), file);
                        }
                    }
                } else {
                    throw new Exception("Unknown feature vector: " + feature_vector);
                }
            } else {
                throw new Exception("Unknown approach: " + approach);
            }
            if (clean) {
                // Formerly "/bin/sh -c rm -rf <file>.treetag* ..." built by string
                // concatenation and destroyed right after launch: it broke on paths with
                // spaces or shell metacharacters and could be killed before finishing.
                try {
                    deleteTemporalFiles(file);
                } catch (Exception e) {
                    System.err.println("\nErrors found (TIMEE):\n\t" + e.toString() + "\n");
                    e.printStackTrace(System.err);
                }
            }
        } catch (Exception e) {
            System.err.println("Errors found (Experimenter):\n\t" + e.toString() + "\n");
            if ("true".equalsIgnoreCase(System.getProperty("DEBUG"))) {
                e.printStackTrace(System.err);
                System.exit(1);
            }
            return null;
        }
        return output;
    }

    /**
     * Deletes every sibling of {@code file} whose name starts with the name of
     * {@code file} followed by ".treetag", ".freeling" or ".roth" (the same
     * set the glob "file.treetag* file.freeling* file.roth*" selects).
     */
    private static void deleteTemporalFiles(String file) throws IOException {
        File base = new File(file).getAbsoluteFile();
        File dir = base.getParentFile();
        if (dir == null) {
            return;
        }
        final String[] prefixes = {
            base.getName() + ".treetag",
            base.getName() + ".freeling",
            base.getName() + ".roth"
        };
        File[] candidates = dir.listFiles(new FilenameFilter() {
            public boolean accept(File parent, String name) {
                for (int i = 0; i < prefixes.length; i++) {
                    if (name.startsWith(prefixes[i])) {
                        return true;
                    }
                }
                return false;
            }
        });
        if (candidates == null) {
            return; // directory not readable: nothing can be cleaned
        }
        for (int i = 0; i < candidates.length; i++) {
            deleteRecursively(candidates[i]);
        }
    }

    /** Deletes a file or a directory tree; symbolic links are removed without being followed. */
    private static void deleteRecursively(File target) throws IOException {
        if (target.isDirectory() && !isSymbolicLink(target)) {
            File[] children = target.listFiles();
            if (children != null) {
                for (int i = 0; i < children.length; i++) {
                    deleteRecursively(children[i]);
                }
            }
        }
        if (!target.delete() && target.exists()) {
            System.err.println("Could not delete temporal file: " + target);
        }
    }

    /** Tells whether the last path element of {@code f} is a symbolic link. */
    private static boolean isSymbolicLink(File f) throws IOException {
        File parent = f.getAbsoluteFile().getParentFile();
        File resolved = (parent == null) ? f.getAbsoluteFile() : new File(parent.getCanonicalFile(), f.getName());
        return !resolved.getCanonicalFile().equals(resolved.getAbsoluteFile());
    }

    /**
     * Adds the WordNet hypernym column to a pipes file, reusing a previously
     * generated "-WNHyps" file when one already exists.
     *
     * @param output path of the input pipes file
     * @param lang language code set on the pipes file and passed to getWNHyps
     *
     * @return path of the "-WNHyps" file, or null if an error was caught
     */
    public static String WN_features(String output, String lang) {
        try {
            NLPFile nlpfile = new PipesFile(output);
            nlpfile.setLanguage(lang);
            ((PipesFile) nlpfile).isWellFormedOptimist();
            boolean debug = "true".equalsIgnoreCase(System.getProperty("DEBUG"));
            if (debug) {
                System.err.println("Executing WN_HYPS");
            }
            String cached = (nlpfile.getFile()).getCanonicalPath() + "-WNHyps";
            if (!new File(cached).exists()) {
                output = BaseTokenFeatures.getWNHyps((PipesFile) nlpfile, lang);
            } else {
                if (debug) {
                    System.err.println("OMITING");
                }
                output = cached;
            }
        } catch (Exception e) {
            System.err.println("Errors found (Experimenter):\n\t" + e.toString() + "\n");
            if ("true".equalsIgnoreCase(System.getProperty("DEBUG"))) {
                e.printStackTrace(System.err);
                System.exit(1);
            }
            return null;
        }
        return output;
    }

    public static String roles_features(String output, String lang) {
        NLPFile nlpfile = null;
        try {
            nlpfile = new PipesFile(output);
            nlpfile.setLanguage(lang);
            ((PipesFile) nlpfile).isWellFormedOptimist();
            if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) {
                System.err.println("Executing ROLECONF");
            }
            output = getVerbRoleconfig((PipesFile) nlpfile);
            nlpfile = new PipesFile(output);
            nlpfile.setLanguage(lang);
            ((PipesFile) nlpfile).isWellFormedOptimist();
            if
(System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) { System.err.println("Executing SIMPLEROLES"); } output = getSimpleRoles((PipesFile) nlpfile); nlpfile=new PipesFile(output); nlpfile.setLanguage(lang); ((PipesFile) nlpfile).isWellFormedOptimist(); if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) { System.err.println("Executing MAINPHRASES"); } //output = ((PipesFile) nlpfile).getMainPhrases(); output = getMainPhrasesPPdetail((PipesFile) nlpfile); } catch (Exception e) { System.err.println("Errors found (Experimenter):\n\t" + e.toString() + "\n"); if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) { e.printStackTrace(System.err); System.exit(1); } return null; } return output; } public static String addFileSentTokenColumns(PipesFile pipesfile) { String outputfile = null; try { outputfile = pipesfile.getFile().getCanonicalPath() + ".TempEval2-features"; String filename = pipesfile.getFile().getCanonicalPath().substring(pipesfile.getFile().getCanonicalPath().lastIndexOf('/') + 1, pipesfile.getFile().getCanonicalPath().lastIndexOf(".tml")); BufferedWriter outfile = new BufferedWriter(new FileWriter(outputfile)); BufferedReader pipesreader = new BufferedReader(new FileReader(pipesfile.getFile())); try { int sentence = 0, token = 0; String pipesline; String[] pipesarr = null; int blanks_col = pipesfile.getColumn("leading(-|_)blanks"); while ((pipesline = pipesreader.readLine()) != null) { if (pipesline.trim().length() > 1) { pipesarr = pipesline.split("\\|"); //outfile.write(pipesfile.getFile().getName()+"|"+sentence+"|"+token); outfile.write(filename + "|" + sentence + "|" + token); for (int i = 0; i < pipesarr.length; i++) { if (i != blanks_col) { outfile.write("|" + pipesarr[i]); } } outfile.write("\n"); token++; } else { outfile.write(pipesline + "\n"); token = 1; sentence++; } } } finally { if (pipesreader != null) { 
pipesreader.close(); } if (outfile != null) { outfile.close(); } } } catch (Exception e) { System.err.println("Errors found (TempEval):\n\t" + e.toString() + "\n"); if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) { e.printStackTrace(System.err); System.exit(1); } return null; } return outputfile; } /** * Returns the pipesfile splited in sentences by empty | * * @return outputfilename */ public String sentSplit(PipesFile pf) { String outputfile = pf.getFile().toString() + ".pipes"; int numline = 0; try { BufferedReader pipesreader = new BufferedReader(new FileReader(pf.getFile())); BufferedWriter outfile = new BufferedWriter(new FileWriter(outputfile)); int sentcolumn = 1; int filecolumn = 0; try { String line; String numsent = "-1"; String filename = "-1"; while ((line = pipesreader.readLine()) != null) { numline++; String[] linearr = line.split("\\|"); if ((!filename.equals(linearr[filecolumn]) || !numsent.equals(linearr[sentcolumn])) && !numsent.equals("-1") && !filename.equals("-1")) { outfile.write("|\n"); } for (int i = 3; i < linearr.length - 1; i++) { outfile.write(linearr[i] + "|"); } outfile.write(linearr[linearr.length - 1] + "\n"); numsent = linearr[sentcolumn]; filename = linearr[filecolumn]; } outfile.write("|\n"); } finally { if (pipesreader != null) { pipesreader.close(); } if (outfile != null) { outfile.close(); } } } catch (Exception e) { System.err.println("Errors found (" + this.getClass().getSimpleName() + "):\n\t" + e.toString() + " (line " + numline + ")\n"); if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) { e.printStackTrace(System.err); System.exit(1); } return null; } return outputfile; } /** * Returns the wnhyps as the perl does * * @return outputfilename */ public static String getWNHyps(PipesFile pipesfile, String lang) { String outputfile = pipesfile.getFile().toString() + "-WNHyps"; int numline = 0; try { BufferedReader pipesreader = new 
BufferedReader(new FileReader(pipesfile.getFile())); BufferedWriter outfile = new BufferedWriter(new FileWriter(outputfile)); int poscolumn = pipesfile.getColumn("pos"); int lemmacolumn = pipesfile.getColumn("lemma"); int last_desc_column = pipesfile.getLastDescColumn(); ArrayList<String> sentence = null; try { String line; int numsent = 0; while ((line = pipesreader.readLine()) != null) { numline++; String[] linearr = line.split("\\|"); //System.err.println(line); if (linearr.length >= pipesfile.getPipesDescArrCount()) { if (sentence == null) { sentence = new ArrayList(); } sentence.add(line); } else { int numtok = 0; // Write file plus WNHyps String WNHyps = "-"; WNInterface wn = new WNInterface(); for (String sline : sentence) { linearr = sline.split("\\|"); numtok++; if (linearr[poscolumn].matches("(V|N).*")) { if (lang.equalsIgnoreCase("EN")) { WNHyps = linearr[poscolumn].substring(0, 1).toLowerCase() + "-" + wn.getHypersHACK(linearr[lemmacolumn].toLowerCase(), linearr[poscolumn]); } if (lang.equalsIgnoreCase("ES")) { WNHyps = linearr[poscolumn].substring(0, 1).toLowerCase() + "-" + wn.getHypersHACKES2(linearr[lemmacolumn].toLowerCase(), linearr[poscolumn]); } } else { WNHyps = "-"; } for (int i = 0; i < linearr.length - 1; i++) { // There are roles columns in the sentence if (i == last_desc_column) { outfile.write(linearr[i] + "|" + WNHyps + "|"); } else { outfile.write(linearr[i] + "|"); } } // There arent roles columns in the sentences if (linearr.length - 1 == last_desc_column) { outfile.write(linearr[linearr.length - 1] + "|" + WNHyps); } else { outfile.write(linearr[linearr.length - 1]); } outfile.write("\n"); WNHyps = "-"; } outfile.write("|\n"); numsent++; sentence = null; } } } finally { if (pipesreader != null) { pipesreader.close(); } if (outfile != null) { outfile.close(); } } } catch (Exception e) { System.err.println("Errors found (TIMEE):\n\t" + e.toString() + " (line " + numline + ")\n"); if (System.getProperty("DEBUG") != null && 
System.getProperty("DEBUG").equalsIgnoreCase("true")) { e.printStackTrace(System.err); System.exit(1); } return null; } return outputfile; } /** * Returns the TempEval-2 feature set for TreeTagger(en) and FreeLing (es) * Tense and polarity are calculated * * @return outputfilename */ public static String lemmaPOS2TempEval2_features(PipesFile pipesfile, String lang) { String outputfile = pipesfile.getFile().toString() + "-POS2"; int numline = 0; try { BufferedReader pipesreader = new BufferedReader(new FileReader(pipesfile.getFile())); BufferedWriter outfile = new BufferedWriter(new FileWriter(outputfile)); int wordcolumn = pipesfile.getColumn("word"); int poscolumn = pipesfile.getColumn("pos"); int lemmacolumn = pipesfile.getColumn("lemma"); int wncolumn = pipesfile.getColumn("wn"); int wordposition = 0; ArrayList<String> sentence = null; ArrayList<String[]> verbs = null; ArrayList<String[]> ppdetail = null; try { String[] wordmem = new String[4]; String[] posmem = new String[2]; wordmem[2] = "-"; wordmem[1] = "-"; wordmem[0] = "-"; posmem[1] = "-"; posmem[0] = "-"; String line; int numsent = 0; while ((line = pipesreader.readLine()) != null) { numline++; wordposition++; String[] linearr = line.split("\\|"); String tense = "-"; String assertype = "-"; String auxiliary = "0"; if (linearr.length >= pipesfile.getPipesDescArrCount()) { wordmem[3] = wordmem[2]; wordmem[2] = wordmem[1]; wordmem[1] = wordmem[0]; wordmem[0] = linearr[wordcolumn].toLowerCase(); posmem[1] = posmem[0]; posmem[0] = linearr[poscolumn]; if (sentence == null) { sentence = new ArrayList(); verbs = new ArrayList<String[]>(); // position, lemma, tense, assertype ppdetail = new ArrayList<String[]>(); // start, end, text wordposition = 1; } //System.out.println(wordposition); sentence.add(line); if (linearr[poscolumn].startsWith("V")) { if (lang.equalsIgnoreCase("es")) { // Freeling tenses String FreelingTense = linearr[poscolumn].substring(1, 4); // 0 type, 1 mode, 2 time //if 
(FreelingTense.charAt(0) == 'M') { // only main verbs if (FreelingTense.charAt(0) == 'A' || FreelingTense.charAt(0) == 'S') { auxiliary = "1"; } if (FreelingTense.charAt(1) == 'G') { // gerundio if (wordmem[1].matches("(?:est(?:oy|ás|á|amos|áis|án|é|és|emos|éis|én))")) { tense = "present-continuous"; } else { if (wordmem[1].matches("(estaba(?:s|n|is)?|estábamos|estuvie(?:ra|se)(?:s|n|is)?|estuvié(?:ra|se)mos)")) { tense = "past-continuous"; } else { if (wordmem[1].equals("estado") && wordmem[2].matches("(?:he|has|ha|hemos|habéis|han)")) { tense = "present-perfect-compound-continuous"; } else { if (wordmem[1].equals("estado") && wordmem[2].matches("había(?:s|n|mos|is)?")) { tense = "past-perfect-compound-continuous"; } } } } } else { if (FreelingTense.charAt(1) == 'P') { // participio if (wordmem[1].matches("(?:he|has|ha|hemos|habéis|han)") || (wordmem[1].equals("sido") && wordmem[2].matches("(?:he|has|ha|hemos|habéis|han)"))) { tense = "present-perfect-compound"; } else { if (wordmem[1].matches("había(?:s|n|mos|is)?") || (wordmem[1].equals("sido") && wordmem[2].matches("había(?:s|n|mos|is)?"))) { tense = "past-perfect-compound"; } // there's another rare case (han estado siendo transportados...) but is lingusitically obscure... } } else { if (FreelingTense.charAt(1) == 'I' || FreelingTense.charAt(1) == 'S' || FreelingTense.charAt(1) == 'M') { // INDICATIVE, SUBJUNCTIVE, IMPERATIVE ...DISCARD INFINITVE... if (FreelingTense.charAt(2) == 'P') { tense = "present"; } else { if (FreelingTense.charAt(2) == 'I') { tense = "past-imperfect"; } else { if (FreelingTense.charAt(2) == 'S') { tense = "past-perfect-simple"; } else { if (FreelingTense.charAt(2) == 'F') { tense = "future"; } else { if (FreelingTense.charAt(2) == 'C') { tense = "conditional"; } } } } } } // hack for Spanish infinitives: NOT useful since we loose the tense of the sentence /*else{ if (FreelingTense.charAt(1) == 'N'){ tense = "present"; // generic infinitive verbs... 
(we can decide what to do with them) } }*/ } } //} if (!tense.equals("-")) { if (wordmem[1].matches("(no|nunca|jamas)") || (wordmem[1].matches("(se|me|nos|os|fu.+|he|has|ha|hemos|habéis|han|había(?:s|n|mos|is)?)") && (wordmem[2].matches("(no|nunca|jamás)") || wordmem[3].matches("(no|nunca|jamás)")))) { assertype = "negative"; } else { assertype = "positive"; } } } else { // en -- Treetager tenses // TODO falta todo lo de being (is being constructed) if (linearr[lemmacolumn].matches("(?:have|be|go|do)")) { auxiliary = "1"; } if (tense.equals("-") && (wordmem[1].matches("(?:.*ed|(?:was|were|did)(?:n't)?|been)") || (wordmem[2].matches("(?:.*ed|was|were|did|been)") && (wordmem[1].equals("to") || wordmem[1].matches("(not|n't)"))))) { tense = "past"; } // TODO might be continuous if (wordmem[1].matches("had(n't)?") || (wordmem[2].equals("had") && wordmem[1].matches("(not|n't|to)"))) { tense = "past-perfect"; } if (wordmem[1].matches("(have|has|'ve)(n't)?") || (wordmem[2].matches("(have|has|'ve)") && wordmem[1].matches("(not|n't|to)"))) { tense = "present-perfect"; } if (tense.equals("-") && wordmem[3].matches("(will|wo)") && wordmem[2].matches("(not|n't|have)") && wordmem[1].equals("be|to")) { tense = "future"; } if (tense.equals("-") && wordmem[3].equals("will") && wordmem[2].equals("have") && wordmem[1].equals("to")) { tense = "future"; } // generic hack for futures like will start crying, will start to cry, will|won't have to/be if (tense.equals("-") && (wordmem[1].equals("will") || wordmem[1].equals("won't") || wordmem[2].equals("will") || wordmem[3].equals("will") || (wordmem[3].equals("wo") && wordmem[2].equals("n't")) || (wordmem[2].equals("wo") && wordmem[1].equals("n't")))) { tense = "future"; } if (tense.equals("-") && wordmem[2].equals("going") && wordmem[1].matches("to")) { tense = "future"; } if (tense.equals("-") && (wordmem[1].matches("(?:would|may|might|should|must)") || ((wordmem[2].matches("(?:would|may|might|should|must)") || 
wordmem[2].matches("(?:would|should)n't")) && wordmem[1].matches("(?:not|n't|be|would(?:n't)?)")) || wordmem[3].matches("(?:would|may|might|should|must)") && wordmem[2].matches("(?:not|n't)") && wordmem[1].equals("be"))) { tense = "conditional"; } // super-hack for not geting past tenses from suposed that to be finished... if (wordmem[1].matches("be")) { tense = "present"; } if (tense.equals("-") && (wordmem[1].matches("(?:is|are|do)(?:n't)?") || (wordmem[2].matches("(?:is|are|do)") && wordmem[1].matches("(?:not|n't)")))) { tense = "present"; } // TODO might be continuous if (tense.equals("-") && (linearr[poscolumn].matches("VB(?:D|N)") || (linearr[poscolumn].equals("AUX") && wordmem[0].matches("(?i)(?:was|were)")))) { tense = "past"; } if (tense.equals("-") && (linearr[poscolumn].matches("VB(?:P|Z)") || (linearr[poscolumn].equals("AUX") && wordmem[0].matches("(?i)(?:was|were)")))) { tense = "present"; } if (tense.equals("-") && (linearr[poscolumn].equals("VBG"))) { // can be improved... 
tense = "present"; } // ignoring possessive 's if(linearr[wordcolumn].equalsIgnoreCase("'s") && !posmem[1].equalsIgnoreCase("PP")){ tense="-"; } if (!tense.equals("-")) { if (wordmem[1].matches("(?:.*n't|not|never)") || (wordmem[2].matches("(?:.*n't|not|never)")) || (wordmem[3].equals("not") && wordmem[2].equals("going") && wordmem[1].equals("to"))) { assertype = "negative"; } else { assertype = "positive"; } } } if (!tense.equals("-")) { if (!tense.endsWith("-continuous") && (pipesfile.getLanguage().equalsIgnoreCase("en") && linearr[wordcolumn].endsWith("ing") || pipesfile.getLanguage().equalsIgnoreCase("es") && linearr[wordcolumn].endsWith("ndo"))) { tense += "-continuous"; } //System.out.println(linearr[lemmacolumn] + "/"+wordposition); String[] verb = {"" + wordposition, linearr[lemmacolumn], tense, assertype, auxiliary}; if (verbs.size() > 0 && (verbs.get(verbs.size() - 1)[4].equals("1") && (wordposition - Integer.parseInt((verbs.get(verbs.size() - 1)[0]))) < 5)) { //System.out.println("yes"); verbs.remove(verbs.size() - 1); } verbs.add(verb); } } } else { int numtok = 0; int nextverbpositioncover = -1; int nextverb = 1; wordposition = -1; String verb[]; String depverb = "-"; if (verbs.size() != 0) { verb = verbs.get(0); wordposition = Integer.parseInt(verb[0]); tense = verb[2]; assertype = verb[3]; depverb = verb[1]; if (verbs.size() > 1) { nextverbpositioncover = Integer.parseInt(verbs.get(1)[0]); nextverbpositioncover = wordposition + (int) Math.ceil((nextverbpositioncover - wordposition) / 2.0); } } String pp = "-"; for (int i = 0; i < sentence.size(); i++) { String sline = sentence.get(i); linearr = sline.split("\\|"); numtok++; String simplepos = linearr[poscolumn]; String simplelemma = linearr[lemmacolumn]; if (linearr[poscolumn].matches("(IN|TO)")) { // if pp is just after end of last one is a multi-pp (2word) if (!pp.equals("-") && i > 0 && sentence.get(i - 1).split("\\|")[poscolumn].matches("(IN|TO)")) { pp += "_" + linearr[wordcolumn]; } else { pp 
+= linearr[wordcolumn]; } } if (linearr[poscolumn].startsWith("V")) { pp = "-"; } if (lang.equalsIgnoreCase("es")) { if (simplepos.length() > 2) { if (simplepos.startsWith("V")) { simplepos = simplepos.substring(0, 4); } else { if (simplepos.startsWith("N")) { simplepos = simplepos.substring(0, 2) + simplepos.substring(3, 4); // N (Common or Proper) (S or Plural) } else { simplepos = simplepos.substring(0, 2); } } } if (linearr[wordcolumn].matches("([0-9]+[0-9./:,-]*|" + NUMEK.numbers_re_ES + ")")) { simplepos = "CD"; } } if (lang.equalsIgnoreCase("en")) { if (!simplepos.matches("(?i)(NP|NPS|NNP|NNPS)")) { simplelemma = simplelemma.toLowerCase(); // lemma to lower case } } if (numtok == nextverbpositioncover) { verb = verbs.get(nextverb); nextverb++; wordposition = Integer.parseInt(verb[0]); tense = verb[2]; assertype = verb[3]; depverb = verb[1]; if (verbs.size() > (nextverb)) { nextverbpositioncover = Integer.parseInt(verbs.get(nextverb)[0]); nextverbpositioncover = wordposition + (int) Math.ceil((nextverbpositioncover - wordposition) / 2); } } if (wncolumn == -1) { outfile.write(linearr[wordcolumn] + "|" + simplepos + "|siob|st|sy|-|" + simplelemma + "|wn|rc|sriob|sriobv|sr|" + depverb + "|" + tense + "|" + assertype + "|iobph|php|phi|" + pp + "\n"); } else { outfile.write(linearr[wordcolumn] + "|" + simplepos + "|siob|st|sy|-|" + simplelemma + "|" + linearr[wncolumn] + "|rc|sriob|sriobv|sr|" + depverb + "|" + tense + "|" + assertype + "|iobph|php|phi|" + pp + "\n"); } } outfile.write("\n"); numsent++; sentence = null; } } // IF LAST SENTENCE DOES NOT ENDED CORRECTLY if (sentence != null) { String[] linearr; String tense = "-"; String assertype = "-"; int numtok = 0; int nextverbpositioncover = -1; int nextverb = 1; wordposition = -1; String verb[]; String depverb = "-"; if (verbs.size() != 0) { verb = verbs.get(0); wordposition = Integer.parseInt(verb[0]); tense = verb[2]; assertype = verb[3]; depverb = verb[1]; if (verbs.size() > 1) { nextverbpositioncover = 
Integer.parseInt(verbs.get(1)[0]); nextverbpositioncover = wordposition + ((nextverbpositioncover - wordposition) / 2); } } for (int i = 0; i < sentence.size(); i++) { String sline = sentence.get(i); linearr = sline.split("\\|"); numtok++; String simplepos = linearr[poscolumn]; String simplelemma = linearr[lemmacolumn]; String pp = "-"; if (linearr[poscolumn].matches("(IN|TO)")) { // if pp is just after end of last one is a multi-pp (2word) if (!pp.equals("-") && i > 0 && sentence.get(i - 1).split("\\|")[poscolumn].matches("(IN|TO)")) { pp += "_" + linearr[wordcolumn]; } else { pp += linearr[wordcolumn]; } } if (linearr[poscolumn].startsWith("V")) { pp = "-"; } if (lang.equalsIgnoreCase("es")) { if (simplepos.length() > 2) { if (simplepos.startsWith("V")) { simplepos = simplepos.substring(0, 4); } else { if (simplepos.startsWith("N")) { simplepos = simplepos.substring(0, 2) + simplepos.substring(3, 4); // N (Common or Proper) (S or Plural) } else { simplepos = simplepos.substring(0, 2); } } } if (linearr[wordcolumn].matches("([0-9]+[0-9./:,-]*|" + NUMEK.numbers_re_ES + ")")) { simplepos = "CD"; } } if (lang.equalsIgnoreCase("en")) { if (!simplepos.matches("(?i)(NP|NPS|NNP|NNPS)")) { simplelemma = simplelemma.toLowerCase(); // lemma to lower case } } if (numtok == nextverbpositioncover) { verb = verbs.get(nextverb); nextverb++; wordposition = Integer.parseInt(verb[0]); tense = verb[2]; assertype = verb[3]; depverb = verb[1]; if (verbs.size() > (nextverb)) { nextverbpositioncover = Integer.parseInt(verbs.get(nextverb)[0]); nextverbpositioncover = wordposition + ((nextverbpositioncover - wordposition) / 2); } } if (wncolumn == -1) { outfile.write(linearr[wordcolumn] + "|" + simplepos + "|siob|st|sy|-|" + simplelemma + "|wn|rc|sriob|sriobv|sr|" + depverb + "|" + tense + "|" + assertype + "|iobph|php|phi|" + pp + "\n"); } else { outfile.write(linearr[wordcolumn] + "|" + simplepos + "|siob|st|sy|-|" + simplelemma + "|" + linearr[wncolumn] + "|rc|sriob|sriobv|sr|" + 
depverb + "|" + tense + "|" + assertype + "|iobph|php|phi|" + pp + "\n"); } } outfile.write("\n"); numsent++; sentence = null; } } finally { if (pipesreader != null) { pipesreader.close(); } if (outfile != null) { outfile.close(); } } } catch (Exception e) { System.err.println("Errors found (TIMEE):\n\t" + e.toString() + " (line " + numline + ")\n"); if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) { e.printStackTrace(System.err); System.exit(1); } return null; } return outputfile; } /** * Returns the input file, plus one column only filled for each verb * * * example: * * (A0*) * * (V*) * * (A1* (A0*) * * (V*) * * * *) (AM-TMP*) * (A2*) * * (A3*) (A2*) * * Output: verb1->A0,V,A1,A2,A3 * verb2->A0,V,AM-TMP,A2 * * ** PULS HACK FOR NUM-LENGTH IN THE 3rd column * * @return outputfilename */ public static String getVerbRoleconfig(PipesFile pipesfile) { String outputfile = pipesfile.getFile().toString() + "-roleconfig"; int numline = 0; try { BufferedReader pipesreader = new BufferedReader(new FileReader(pipesfile.getFile())); BufferedWriter outfile = new BufferedWriter(new FileWriter(outputfile)); int verbcolumn = pipesfile.getColumn("verb"); // each verb will have a roles column int last_desc_column = pipesfile.getLastDescColumn(); ArrayList<String> sentence = null; try { String line; int numsent = 0; ArrayList<SRLColParser> rolesverbs = null; while ((line = pipesreader.readLine()) != null) { numline++; String[] linearr = line.split("\\|"); if (linearr.length >= pipesfile.getPipesDescArrCount()) { if (sentence == null) { sentence = new ArrayList(); if (linearr.length - pipesfile.getPipesDescArrCount() > 0) { rolesverbs = new ArrayList<SRLColParser>(); } } if (!linearr[verbcolumn].equals("-")) { rolesverbs.add(new SRLColParser(linearr[verbcolumn], "*", 0)); } sentence.add(line); } else { int numtok = 0; // Parse roles for (String sline : sentence) { linearr = sline.split("\\|"); if (rolesverbs != null) { 
//System.out.println(sline+" "+rolesverbs.size()+"\n"); for (int srlcol = 0; srlcol < rolesverbs.size(); srlcol++) { SRLColParser tempsrl = rolesverbs.get(srlcol); tempsrl.parse(linearr[pipesfile.getPipesDescArrCount() + srlcol]); rolesverbs.set(srlcol, tempsrl); } } } // Write file plus rolesconf String rolesconf = "-"; int verbact = 0; for (String sline : sentence) { linearr = sline.split("\\|"); numtok++; if (rolesverbs != null && !linearr[verbcolumn].equals("-")) { rolesconf = rolesverbs.get(verbact).getRoleconf(); verbact++; } else { rolesconf = "-"; } for (int i = 0; i < linearr.length - 1; i++) { // There are roles columns in the sentence if (i == last_desc_column) { outfile.write(linearr[i] + "|" + rolesconf + "|"); } else { // HACK FOR NUM-LENGHT if (i == 3) { if (linearr[0].matches("[0-9]+")) { linearr[i] = "" + linearr[0].length(); } else { linearr[i] = "-"; } } outfile.write(linearr[i] + "|"); } } // There arent roles columns in the sentences if (linearr.length - 1 == last_desc_column) { outfile.write(linearr[linearr.length - 1] + "|" + rolesconf); } else { outfile.write(linearr[linearr.length - 1]); } outfile.write("\n"); rolesconf = "-"; } outfile.write("|\n"); numsent++; sentence = null; rolesverbs = null; } } } finally { if (pipesreader != null) { pipesreader.close(); } if (outfile != null) { outfile.close(); } } } catch (Exception e) { System.err.println("Errors found (TIMEE):\n\t" + e.toString() + " (line " + numline + ")\n"); if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) { e.printStackTrace(System.err); System.exit(1); } return null; } return outputfile; } /** * Returns the input file, simplifying all the roles columns in just one * * The simplification process consists in getting always the smallest available role * taking into account the roles columns order. 
That is to say, if we beggin using roles * of the first column and the we switch to the second we only can go back to the first * if the role starts in this line (X*. THAT PREVENTS US TO MIX ROLES. * * example: * SIMPLIFICATION * (A0*) * (A0*) * (V*) * (V*) * (A1* (A0*) (A0*) * * (V*) (V*) * * * ---> PREVENTED TO BE A1 * *) (AM-TMP*) (AM-TMP*) * (A2*) * (A2*) ----> NOW WE CAN GO BACK * (A3*) (A2*) (A2*) ----> THE PREFERENCE IF EQ LENGHT IS FOR THE LAST ONE * * @param pipesfile the pipesfile over which the simple roles must be obtained * * @return outputfilename */ public static String getSimpleRoles(PipesFile pipesfile) { String outputfile = pipesfile.getFile().toString() + "-simpleroles"; int numline = 0; try { BufferedReader pipesreader = new BufferedReader(new FileReader(pipesfile.getFile())); BufferedWriter outfile = new BufferedWriter(new FileWriter(outputfile)); int verbcolumn = pipesfile.getColumn("verb"); // each verb will have a roles column int poscolumn = pipesfile.getColumn("pos"); int wordcolumn = pipesfile.getColumn("(word|token)"); int rccolumn = pipesfile.getColumn("roleconf"); ArrayList<String> sentence = null; try { String[] wordmem = new String[4]; wordmem[2] = "-"; wordmem[1] = "-"; wordmem[0] = "-"; String line; int numsent = 0; ArrayList<SRLColParser> rolesverbs = null; ArrayList<String> roleconf = null; ArrayList<String> tenses = null; // tense(pseudo-aspect) //present-perfect == past ArrayList<String> assertype = null; // afirmative - negative while ((line = pipesreader.readLine()) != null) { numline++; String[] linearr = line.split("\\|"); if (linearr.length >= pipesfile.getPipesDescArrCount()) { wordmem[3] = wordmem[2]; wordmem[2] = wordmem[1]; wordmem[1] = wordmem[0]; wordmem[0] = linearr[wordcolumn]; if (sentence == null) { sentence = new ArrayList(); if (linearr.length - pipesfile.getPipesDescArrCount() > 0) { rolesverbs = new ArrayList<SRLColParser>(); tenses = new ArrayList<String>(); assertype = new ArrayList<String>(); roleconf = 
new ArrayList<String>(); } } if (!linearr[verbcolumn].equals("-")) { String tense = "-"; String assetype = "positive"; rolesverbs.add(new SRLColParser(linearr[verbcolumn], "*", 0)); roleconf.add(linearr[rccolumn]); if (pipesfile.getLanguage().equalsIgnoreCase("es")) { if (wordmem[1].matches("(no|nunca|jamas)") || (wordmem[1].matches("(se|fu.+|he|has|ha|hemos|habéis|han|había)") && wordmem[2].matches("(no|nunca|jamás)")) || (wordmem[3].matches("(no|nunca|jamás)") && wordmem[2].equals("se") && wordmem[1].matches("(he|has|ha|hemos|habéis|han|había)"))) { assetype = "negative"; } tense = linearr[poscolumn].substring(1, 4); if (tense.equals("PAS") || tense.equals("IMP")) { tense = "past"; } else { if (tense.equals("PRE")) { tense = "present"; } else { if (tense.equals("FUT")) { tense = "future"; } else { if (tense.equals("CON")) { tense = "conditional"; } else { } } } } if (linearr[poscolumn].length() >= 7 && linearr[poscolumn].substring(4, 7).equals("past")) { tense = "present-perfect"; } } else { if (tense.equals("-") && (wordmem[1].matches("(?:.*ed|(?:was|were|did)(?:n't)?|been)") || (wordmem[2].matches("(?:.*ed|was|were|did|been)") && (wordmem[1].equals("to") || wordmem[1].matches("(not|n't)"))))) { tense = "past"; } // TODO might be continuous if (wordmem[1].matches("had(n't)?") || (wordmem[2].equals("had") && wordmem[1].matches("(not|n't|to)"))) { tense = "past-perfect"; } if (wordmem[1].matches("(have|has|'ve)(n't)?") || (wordmem[2].matches("(have|has|'ve)") && wordmem[1].matches("(not|n't|to)"))) { tense = "present-perfect"; } if (tense.equals("-") && wordmem[3].matches("(will|wo)") && wordmem[2].matches("(not|n't|have)") && wordmem[1].equals("be|to")) { tense = "future"; } if (tense.equals("-") && wordmem[3].equals("will") && wordmem[2].equals("have") && wordmem[1].equals("to")) { tense = "future"; } // generic hack for futures like will start crying, will start to cry, will|won't have to/be if (tense.equals("-") && (wordmem[1].equals("will") || 
wordmem[1].equals("won't") || wordmem[2].equals("will") || wordmem[3].equals("will") || (wordmem[3].equals("wo") && wordmem[2].equals("n't")) || (wordmem[2].equals("wo") && wordmem[1].equals("n't")))) { tense = "future"; } if (tense.equals("-") && wordmem[2].equals("going") && wordmem[1].matches("to")) { tense = "future"; } if (tense.equals("-") && (wordmem[1].equals("would") || ((wordmem[2].equals("would") || wordmem[2].equals("wouldn't")) && wordmem[1].matches("(?:not|n't|be|would(?:n't)?)")) || wordmem[3].equals("would") && wordmem[2].matches("(?:not|n't)") && wordmem[1].equals("be"))) { tense = "conditional"; } if (tense.equals("-") && (wordmem[1].matches("(?:is|are|do)(?:n't)?") || (wordmem[2].matches("(?:is|are|do)") && wordmem[1].matches("(?:not|n't)")))) { tense = "present"; } // TODO might be continuous if (tense.equals("-") && (linearr[poscolumn].matches("VB(?:D|N)") || (linearr[poscolumn].equals("AUX") && wordmem[0].matches("(?i)(?:was|were)")))) { tense = "past"; } if (tense.equals("-") && (linearr[poscolumn].matches("VB(?:P|Z)") || (linearr[poscolumn].equals("AUX") && wordmem[0].matches("(?i)(?:was|were)")))) { tense = "present"; } if (tense.equals("-") && (linearr[poscolumn].equals("VBG"))) { // can be improved... tense = "present"; } if (!tense.equals("-")) { if (wordmem[1].matches("(?:.*n't|not|never)") || (wordmem[2].matches("(?:.*n't|not|never)")) || (wordmem[3].equals("not") && wordmem[2].equals("going") && wordmem[1].equals("to"))) { assetype = "negative"; } } } // TODO HANDLE INFINITIVES... 
if (!tense.equals("-")) { if (!tense.endsWith("-continuous") && (pipesfile.getLanguage().equalsIgnoreCase("en") && linearr[wordcolumn].endsWith("ing") || pipesfile.getLanguage().equalsIgnoreCase("es") && linearr[wordcolumn].endsWith("ndo"))) { tense += "-continuous"; } } if (tense.equals("-")) { tense = "present"; } tenses.add(tense); assertype.add(assetype); } sentence.add(line); } else { int numtok = 0; int active_roles_col = 0; for (String sline : sentence) { linearr = sline.split("\\|"); numtok++; // Roles String currentrole = "O"; String currentverb = "-"; String currentrc = "-"; String currentIOB2 = ""; String currentTense = "present"; String currentAssertype = "positive"; if (rolesverbs != null) { int currentrolesize = 100; int currentcol = 0; for (int srlcol = 0; srlcol < rolesverbs.size(); srlcol++) { // Parse roles SRLColParser tempsrl = rolesverbs.get(srlcol); tempsrl.parse(linearr[pipesfile.getPipesDescArrCount() + srlcol]); rolesverbs.set(srlcol, tempsrl); // Choose roles if (rolesverbs.get(srlcol).getSize() > 0 && rolesverbs.get(srlcol).getSize() <= currentrolesize) { if (srlcol >= active_roles_col || linearr[pipesfile.getPipesDescArrCount() + srlcol].startsWith("(")) { if (linearr[pipesfile.getPipesDescArrCount() + srlcol].startsWith("(")) { currentIOB2 = "B-"; } else { currentIOB2 = "I-"; } currentrole = rolesverbs.get(srlcol).getRole(); currentverb = rolesverbs.get(srlcol).getVerb(); currentrc = roleconf.get(srlcol); currentrolesize = rolesverbs.get(srlcol).getSize(); currentcol = srlcol; currentTense = tenses.get(srlcol); currentAssertype = assertype.get(srlcol); } } } if (rolesverbs.size() == 1) { currentverb = rolesverbs.get(0).getVerb(); currentTense = tenses.get(0); } active_roles_col = currentcol; } // "currentrole" in the new column simplerole for (int i = 0; i < pipesfile.getPipesDescArrCount() - 1; i++) { outfile.write(linearr[i] + "|"); } //System.err.println("tense="+currentTense); if (pipesfile.getLanguage().equalsIgnoreCase("es") && 
!linearr[verbcolumn].equals("-")) { currentrole = "V"; } outfile.write(currentrc + "|" + currentIOB2 + currentrole + "|" + currentIOB2 + currentrole + "+" + currentverb + "|" + currentrole + "|" + currentverb + "|" + currentTense + "|" + currentAssertype + "\n"); } outfile.write("|\n"); numsent++; sentence = null; rolesverbs = null; } } } finally { if (pipesreader != null) { pipesreader.close(); } if (outfile != null) { outfile.close(); } } } catch (Exception e) { System.err.println("Errors found (TIMEE):\n\t" + e.toString() + " (line " + numline + ")\n"); if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) { e.printStackTrace(System.err); System.exit(1); } return null; } return outputfile; } /** * Returns the input file plus 4 columns * 1 a column indicating the BIO of the main phrases (arguments) * 2 a column indicating the position of the current token in this phrase * 3 a column indicating if the token is the header of the phrase * 4 a column (only for PP) indicating if the token is the secondary header of the phrase (NN, ADJ, ADV) * * @return outputfilename */ public static String getMainPhrasesPPdetail(PipesFile pipesfile) { String outputfile = pipesfile.getFile().toString() + "-mainphrases"; int numline = 0; try { int syntcolumn = pipesfile.getColumn("synt"); BufferedReader pipesreader = new BufferedReader(new FileReader(pipesfile.getFile())); BufferedWriter outfile = new BufferedWriter(new FileWriter(outputfile)); ArrayList<String> sentence = null; int POScol = pipesfile.getColumn("pos"); int wordcol = pipesfile.getColumn("(word|token)"); try { String line; int numsent = 0; while ((line = pipesreader.readLine()) != null) { numline++; String[] linearr = line.split("\\|"); if (linearr.length >= pipesfile.getPipesDescArrCount()) { if (sentence == null) { sentence = new ArrayList(); } sentence.add(line); } else { //System.out.println("\n"); int numtok = 0; int phra_id = 0; SyntColParser syntparser = new 
SyntColParser(); String currentmainphrase = "O"; ArrayList<String[]> completemainphrase = new ArrayList<String[]>(); for (String sline : sentence) { linearr = sline.split("\\|"); numtok++; // Synt boolean hasClosingBrakets = false; if (linearr[syntcolumn].indexOf(')') != -1) { hasClosingBrakets = true; } if (hasClosingBrakets) { syntparser.parse(linearr[syntcolumn].substring(0, linearr[syntcolumn].indexOf(')'))); } else { syntparser.parse(linearr[syntcolumn]); } currentmainphrase = syntparser.getCurrentMainPhraseBIO(); if (currentmainphrase.equals("O") || (currentmainphrase.startsWith("B-") && completemainphrase.size() > 0)) { if (completemainphrase.size() > 0) { phra_id++; String PPdetail = "-"; if (completemainphrase.get(0)[pipesfile.getPipesDescArrCount()].equals("B-PP")) { for (String[] tmp_linearr : completemainphrase) { if (tmp_linearr[POScol].matches("(IN|TO)")) { if (PPdetail.equals("-")) { PPdetail = tmp_linearr[wordcol]; } else { PPdetail += "_" + tmp_linearr[wordcol]; } } else { break; } } } for (String[] tmp_linearr : completemainphrase) { for (int i = 0; i < tmp_linearr.length; i++) { outfile.write(tmp_linearr[i] + "|"); } outfile.write("phra" + phra_id + "|" + PPdetail + "\n"); } completemainphrase.clear(); } if (currentmainphrase.equals("O")) { for (int i = 0; i < pipesfile.getPipesDescArrCount(); i++) { outfile.write(linearr[i] + "|"); } outfile.write(syntparser.getCurrentMainPhraseBIO() + "|" + syntparser.getCurrentPositionInMainPhrase() + "|-|-\n"); } else { String[] tmp_linearr = new String[pipesfile.getPipesDescArrCount() + 2]; for (int i = 0; i < pipesfile.getPipesDescArrCount(); i++) { tmp_linearr[i] = linearr[i]; } tmp_linearr[pipesfile.getPipesDescArrCount()] = currentmainphrase; tmp_linearr[pipesfile.getPipesDescArrCount() + 1] = ((Integer) syntparser.getCurrentPositionInMainPhrase()).toString(); completemainphrase.add(tmp_linearr); } } else { String[] tmp_linearr = new String[pipesfile.getPipesDescArrCount() + 2]; for (int i = 0; i < 
pipesfile.getPipesDescArrCount(); i++) { tmp_linearr[i] = linearr[i]; } tmp_linearr[pipesfile.getPipesDescArrCount()] = currentmainphrase; tmp_linearr[pipesfile.getPipesDescArrCount() + 1] = ((Integer) syntparser.getCurrentPositionInMainPhrase()).toString(); completemainphrase.add(tmp_linearr); } //System.out.println(sline + " - " + syntparser.getParlevel()); if (hasClosingBrakets) { //System.out.println(syntparser.getFull() + " \n---> " + syntparser.getCurrent()); syntparser.parse(linearr[syntcolumn].substring(linearr[syntcolumn].indexOf(')'))); } //System.out.println(sline + " - " + syntparser.getParlevel()); } if (completemainphrase.size() > 0) { phra_id++; String PPdetail = "-"; if (completemainphrase.get(0)[pipesfile.getPipesDescArrCount()].equals("B-PP")) { for (String[] tmp_linearr : completemainphrase) { if (tmp_linearr[POScol].matches("(IN|TO)")) { if (PPdetail.equals("-")) { PPdetail = tmp_linearr[wordcol]; } else { PPdetail += "_" + tmp_linearr[wordcol]; } } else { break; } } } for (String[] tmp_linearr : completemainphrase) { for (int i = 0; i < tmp_linearr.length; i++) { outfile.write(tmp_linearr[i] + "|"); } outfile.write("phra" + phra_id + "|" + PPdetail + "\n"); } completemainphrase.clear(); } if (syntparser.getParlevel() != 0) { throw new Exception("Syntactic Parser ended with no 0 parlevel (" + syntparser.getParlevel() + ")."); } //System.out.println(numline); outfile.write("|\n"); numsent++; sentence = null; syntparser = null; } } } finally { if (pipesreader != null) { pipesreader.close(); } if (outfile != null) { outfile.close(); } } } catch (Exception e) { System.err.println("Errors found (TIMEE):\n\t" + e.toString() + " (line " + numline + ")\n"); if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) { e.printStackTrace(System.err); System.exit(1); } return null; } return outputfile; } /** * Returns * A static n window context (default 9) whenever it is possible... 
* * @return outputfilename */ public static String getStaticWin(PipesFile pf) { String outputfile = pf.getFile().toString() + ".StaticWin-features"; int numline = 0; try { // Window configuration int window_size = 9; // must be a odd number if (window_size % 2 != 1) { throw new Exception("Window size must be a positive odd number (1,3,5,7,etc.)"); } int half_win_size = (window_size - 1) / 2; BufferedReader pipesreader = new BufferedReader(new FileReader(pf.getFile())); BufferedWriter outfile = new BufferedWriter(new FileWriter(outputfile)); ArrayList<String> sentence = null; int POScol = pf.getColumn("pos"); int wordcol = pf.getColumn("(word|token)"); int lemmacol = pf.getColumn("lemma"); try { String curr_fileid = ""; String curr_sentN = ""; String line; String[] linearr; int numsent = 0; while ((line = pipesreader.readLine()) != null) { numline++; linearr = line.split("\\|"); if (curr_fileid.equals("")) { curr_fileid = linearr[0]; } if (curr_sentN.equals("")) { curr_sentN = linearr[1]; } //System.out.println(curr_fileid+" "+curr_sentN+" "+linearr[0]+" "+linearr[1]+"\n"); if (curr_fileid.equals(linearr[0]) && curr_sentN.equals(linearr[1])) { //System.out.println(curr_fileid+" adding "+curr_sentN+"\n"); if (sentence == null) { sentence = new ArrayList(); } sentence.add(line); } else { // update curr_markers curr_fileid = linearr[0]; curr_sentN = linearr[1]; //System.out.println("Processing "+curr_fileid+" "+curr_sentN+" "+linearr[0]+" "+linearr[1]+"\n"); String[] lemma_win = new String[window_size]; String[] pos_win = new String[window_size]; String lepo = "-"; for (int numtok = 0; numtok < sentence.size(); numtok++) { //System.out.println("processing token "+numtok+" size="+sentence.size()); for (int i = 0; i < window_size; i++) { int position = numtok + i - half_win_size; if (position < 0 || position >= sentence.size()) { lemma_win[i] = "-"; pos_win[i] = "-"; } else { linearr = sentence.get(position).split("\\|"); lemma_win[i] = linearr[lemmacol]; pos_win[i] = 
linearr[POScol]; if (position == numtok) { lepo = linearr[lemmacol] + "+" + linearr[POScol]; } } } outfile.write(sentence.get(numtok)); // write lemma window for (int i = 0; i < window_size; i++) { outfile.write("|" + lemma_win[i]); } // write lemma bigrams for (int i = 0; i < window_size - 1; i++) { outfile.write("|" + lemma_win[i] + "+" + lemma_win[i + 1]); } // write lemma trigrams for (int i = 0; i < window_size - 2; i++) { outfile.write("|" + lemma_win[i] + "+" + lemma_win[i + 1] + "+" + lemma_win[i + 2]); } // write POS window for (int i = 0; i < window_size; i++) { outfile.write("|" + pos_win[i]); } // write POS bigrams for (int i = 0; i < window_size - 1; i++) { outfile.write("|" + pos_win[i] + "+" + pos_win[i + 1]); } // wirte POS trigrams for (int i = 0; i < window_size - 2; i++) { outfile.write("|" + pos_win[i] + "+" + pos_win[i + 1] + "+" + pos_win[i + 2]); } outfile.write("|" + lepo + "\n"); } numsent++; sentence = null; sentence = new ArrayList(); sentence.add(line); } } if (sentence != null) { String[] lemma_win = new String[window_size]; String[] pos_win = new String[window_size]; String lepo = "-"; for (int numtok = 0; numtok < sentence.size(); numtok++) { for (int i = 0; i < window_size; i++) { int position = numtok + i - half_win_size; if (position < 0 || position >= sentence.size()) { lemma_win[i] = "-"; pos_win[i] = "-"; } else { linearr = sentence.get(position).split("\\|"); lemma_win[i] = linearr[lemmacol]; pos_win[i] = linearr[POScol]; if (position == numtok) { lepo = linearr[lemmacol] + "+" + linearr[POScol]; } } } outfile.write(sentence.get(numtok)); // write lemma window for (int i = 0; i < window_size; i++) { outfile.write("|" + lemma_win[i]); } // write lemma bigrams for (int i = 0; i < window_size - 1; i++) { outfile.write("|" + lemma_win[i] + "+" + lemma_win[i + 1]); } // write lemma trigrams for (int i = 0; i < window_size - 2; i++) { outfile.write("|" + lemma_win[i] + "+" + lemma_win[i + 1] + "+" + lemma_win[i + 2]); } // write POS 
window for (int i = 0; i < window_size; i++) { outfile.write("|" + pos_win[i]); } // write POS bigrams for (int i = 0; i < window_size - 1; i++) { outfile.write("|" + pos_win[i] + "+" + pos_win[i + 1]); } // wirte POS trigrams for (int i = 0; i < window_size - 2; i++) { outfile.write("|" + pos_win[i] + "+" + pos_win[i + 1] + "+" + pos_win[i + 2]); } outfile.write("|" + lepo + "\n"); } } } finally { if (pipesreader != null) { pipesreader.close(); } if (outfile != null) { outfile.close(); } } } catch (Exception e) { System.err.println("Errors found (TIMEE):\n\t" + e.toString() + " (line " + numline + ")\n"); if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) { e.printStackTrace(System.err); System.exit(1); } return null; } return outputfile; } /** * Returns * A Dynamic n window context (default 9) syntactically motivated whenever it is possible... * * @return outputfilename */ public static String getDynamicWin(PipesFile pf) { String outputfile = pf.getFile().toString() + ".DynamicWin-features"; int numline = 0; try { // Window configuration int window_size = 9; // must be a odd number if (window_size % 2 != 1) { throw new Exception("Window size must be a positive odd number (1,3,5,7,etc.)"); } int half_win_size = (window_size - 1) / 2; BufferedReader pipesreader = new BufferedReader(new FileReader(pf.getFile())); BufferedWriter outfile = new BufferedWriter(new FileWriter(outputfile)); ArrayList<String> sentence = null; int POScol = pf.getColumn("pos"); int wordcol = pf.getColumn("(word|token)"); int lemmacol = pf.getColumn("lemma"); int syntcolumn = pf.getColumn("synt"); ArrayList<Integer[]> mainPhrasesSpan = null; try { String curr_fileid = ""; String curr_sentN = ""; String line; String[] linearr; int numsent = 0; while ((line = pipesreader.readLine()) != null) { numline++; linearr = line.split("\\|"); if (curr_fileid.equals("")) { curr_fileid = linearr[0]; } if (curr_sentN.equals("")) { curr_sentN = linearr[1]; } if 
(curr_fileid.equals(linearr[0]) && curr_sentN.equals(linearr[1])) { if (sentence == null) { sentence = new ArrayList(); } sentence.add(line); } else { // update curr_markers curr_fileid = linearr[0]; curr_sentN = linearr[1]; //System.out.println("\n"); SyntColParser syntparser = new SyntColParser(); for (String sline : sentence) { linearr = sline.split("\\|"); // Synt boolean hasClosingBrakets = false; if (linearr[syntcolumn].indexOf(')') != -1) { hasClosingBrakets = true; } if (hasClosingBrakets) { syntparser.parse(linearr[syntcolumn].substring(0, linearr[syntcolumn].indexOf(')'))); } else { syntparser.parse(linearr[syntcolumn]); } if (hasClosingBrakets) { syntparser.parse(linearr[syntcolumn].substring(linearr[syntcolumn].indexOf(')'))); } } mainPhrasesSpan = syntparser.getMainPhrasesSpan(); //System.out.println("Processing "+curr_fileid+" "+curr_sentN+" "+linearr[0]+" "+linearr[1]+"\n"); String[] lemma_win = new String[window_size]; String[] pos_win = new String[window_size]; String lepo = "-"; int current_main_phrase = 0; Integer[] current_mainPhraseSpan = null; if (mainPhrasesSpan != null && mainPhrasesSpan.size() > 0) { current_mainPhraseSpan = new Integer[2]; current_mainPhraseSpan = mainPhrasesSpan.get(current_main_phrase); } for (int numtok = 0; numtok < sentence.size(); numtok++) { //System.out.println("processing token "+numtok+" size="+sentence.size()); if (mainPhrasesSpan != null && current_mainPhraseSpan != null && numtok > current_mainPhraseSpan[1]) { if (mainPhrasesSpan.size() > (current_main_phrase + 1)) { current_main_phrase++; current_mainPhraseSpan = mainPhrasesSpan.get(current_main_phrase); } } String[] numtok_linearr = sentence.get(numtok).split("\\|"); for (int i = 0; i < window_size; i++) { int position = numtok + i - half_win_size; if (position < 0 || position >= sentence.size()) { lemma_win[i] = "-"; pos_win[i] = "-"; } else { linearr = sentence.get(position).split("\\|"); if (((current_mainPhraseSpan != null && 
!numtok_linearr[POScol].matches("(V.*|AUX|MD)") && (position < current_mainPhraseSpan[0] || position > current_mainPhraseSpan[1] || numtok < current_mainPhraseSpan[0] || numtok > current_mainPhraseSpan[1])) || (numtok_linearr[POScol].matches("(V.*|AUX|MD)") && (position < numtok - 2 || position > numtok + 2 || !linearr[POScol].matches("(V.*|AUX|MD)")))) && position != numtok) { lemma_win[i] = "-"; pos_win[i] = "-"; } else { lemma_win[i] = linearr[lemmacol]; pos_win[i] = linearr[POScol]; if (position == numtok) { lepo = linearr[lemmacol] + "+" + linearr[POScol]; } } } } outfile.write(sentence.get(numtok)); // write lemma window for (int i = 0; i < window_size; i++) { outfile.write("|" + lemma_win[i]); } // write lemma bigrams for (int i = 0; i < window_size - 1; i++) { outfile.write("|" + lemma_win[i] + "+" + lemma_win[i + 1]); } // write lemma trigrams for (int i = 0; i < window_size - 2; i++) { outfile.write("|" + lemma_win[i] + "+" + lemma_win[i + 1] + "+" + lemma_win[i + 2]); } // write POS window for (int i = 0; i < window_size; i++) { outfile.write("|" + pos_win[i]); } // write POS bigrams for (int i = 0; i < window_size - 1; i++) { outfile.write("|" + pos_win[i] + "+" + pos_win[i + 1]); } // wirte POS trigrams for (int i = 0; i < window_size - 2; i++) { outfile.write("|" + pos_win[i] + "+" + pos_win[i + 1] + "+" + pos_win[i + 2]); } outfile.write("|" + lepo + "\n"); } numsent++; syntparser = null; sentence = null; sentence = new ArrayList(); sentence.add(line); } } if (sentence != null) { SyntColParser syntparser = new SyntColParser(); for (String sline : sentence) { linearr = sline.split("\\|"); // Synt boolean hasClosingBrakets = false; if (linearr[syntcolumn].indexOf(')') != -1) { hasClosingBrakets = true; } if (hasClosingBrakets) { syntparser.parse(linearr[syntcolumn].substring(0, linearr[syntcolumn].indexOf(')'))); } else { syntparser.parse(linearr[syntcolumn]); } if (hasClosingBrakets) { 
syntparser.parse(linearr[syntcolumn].substring(linearr[syntcolumn].indexOf(')'))); } } mainPhrasesSpan = syntparser.getMainPhrasesSpan(); String[] lemma_win = new String[window_size]; String[] pos_win = new String[window_size]; String lepo = "-"; int current_main_phrase = 0; Integer[] current_mainPhraseSpan = null; if (mainPhrasesSpan != null && mainPhrasesSpan.size() > 0) { current_mainPhraseSpan = new Integer[2]; current_mainPhraseSpan = mainPhrasesSpan.get(current_main_phrase); } for (int numtok = 0; numtok < sentence.size(); numtok++) { //System.out.println("processing token "+numtok+" size="+sentence.size()); if (current_mainPhraseSpan != null && current_mainPhraseSpan != null && numtok > current_mainPhraseSpan[1]) { if (mainPhrasesSpan.size() > (current_main_phrase + 1)) { current_main_phrase++; current_mainPhraseSpan = mainPhrasesSpan.get(current_main_phrase); } } String[] numtok_linearr = sentence.get(numtok).split("\\|"); for (int i = 0; i < window_size; i++) { int position = numtok + i - half_win_size; if (position < 0 || position >= sentence.size()) { lemma_win[i] = "-"; pos_win[i] = "-"; } else { linearr = sentence.get(position).split("\\|"); if (((current_mainPhraseSpan != null && !numtok_linearr[POScol].matches("(V.*|AUX|MD)") && (position < current_mainPhraseSpan[0] || position > current_mainPhraseSpan[1] || numtok < current_mainPhraseSpan[0] || numtok > current_mainPhraseSpan[1])) || (numtok_linearr[POScol].matches("(V.*|AUX|MD)") && (position < numtok - 2 || position > numtok + 2 || !linearr[POScol].matches("(V.*|AUX|MD)")))) && position != numtok) { lemma_win[i] = "-"; pos_win[i] = "-"; } else { lemma_win[i] = linearr[lemmacol]; pos_win[i] = linearr[POScol]; if (position == numtok) { lepo = linearr[lemmacol] + "+" + linearr[POScol]; } } } } outfile.write(sentence.get(numtok)); // write lemma window for (int i = 0; i < window_size; i++) { outfile.write("|" + lemma_win[i]); } // write lemma bigrams for (int i = 0; i < window_size - 1; i++) { 
outfile.write("|" + lemma_win[i] + "+" + lemma_win[i + 1]); } // write lemma trigrams for (int i = 0; i < window_size - 2; i++) { outfile.write("|" + lemma_win[i] + "+" + lemma_win[i + 1] + "+" + lemma_win[i + 2]); } // write POS window for (int i = 0; i < window_size; i++) { outfile.write("|" + pos_win[i]); } // write POS bigrams for (int i = 0; i < window_size - 1; i++) { outfile.write("|" + pos_win[i] + "+" + pos_win[i + 1]); } // wirte POS trigrams for (int i = 0; i < window_size - 2; i++) { outfile.write("|" + pos_win[i] + "+" + pos_win[i + 1] + "+" + pos_win[i + 2]); } outfile.write("|" + lepo + "\n"); } } } finally { if (pipesreader != null) { pipesreader.close(); } if (outfile != null) { outfile.close(); } } } catch (Exception e) { System.err.println("Errors found (TIMEE):\n\t" + e.toString() + " (line " + numline + ")\n"); if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) { e.printStackTrace(System.err); System.exit(1); } return null; } return outputfile; } public static String GetPairSpecialTempEval2_features(PipesFile pipesfile, String plainmodel) { String outputfile = null; int linen = 0; try { boolean usingNotUTF8tool = false; outputfile = plainmodel + ".TempEval2-features"; String filename = ""; if (pipesfile.getLanguage().equalsIgnoreCase("EN")) { if (pipesfile.getFile().getAbsolutePath().contains(".roth")) { filename = pipesfile.getFile().getAbsolutePath().substring(pipesfile.getFile().getCanonicalPath().lastIndexOf('/') + 1, pipesfile.getFile().getAbsolutePath().lastIndexOf(".roth")); usingNotUTF8tool = true; } else { filename = pipesfile.getFile().getAbsolutePath().substring(pipesfile.getFile().getCanonicalPath().lastIndexOf('/') + 1, pipesfile.getFile().getAbsolutePath().lastIndexOf(".treetag")); } } if (pipesfile.getLanguage().equalsIgnoreCase("ES")) { filename = pipesfile.getFile().getAbsolutePath().substring(pipesfile.getFile().getCanonicalPath().lastIndexOf('/') + 1, 
pipesfile.getFile().getAbsolutePath().lastIndexOf(".freeling")); usingNotUTF8tool = true; } BufferedWriter outfile = new BufferedWriter(new FileWriter(outputfile)); BufferedReader pipesreader = new BufferedReader(new FileReader(pipesfile.getFile())); BufferedReader modelreader = new BufferedReader(new FileReader(plainmodel)); try { int sentence = 0, tokn = 0; int tokcolumn = pipesfile.getColumn("(tok|word).*"); String pipesline; String[] pipesarr = null; char cmodel = '\0'; char cmodel_prev = '\0'; int offset = -1; boolean readmodel = true; /*int token_leading_blanks = 0;int token_leading_tabs = 0;int token_leading_newlines = 0;*/ String leadingBlanksString = ""; while ((pipesline = pipesreader.readLine()) != null) { linen++; //System.err.println(pipesline); if (pipesline.trim().length() > 1) { pipesarr = pipesline.split("\\|"); String token = pipesarr[tokcolumn]; int token_offset = -1; String paired_token = ""; int cn = 0; while (true) { if (readmodel) { cmodel_prev = cmodel; if ((cmodel = (char) modelreader.read()) == -1) { // NOTE: does not work if no EOF char \0A if (cn == token.length()) { break; // save last token end of file } else { throw new Exception("Premature end of model file"); } } if(cmodel=='\uffff'){ break; // save last token end of file } offset++; } else { readmodel = true; } char cpipes = '\0'; if (cn >= token.length()) { if (usingNotUTF8tool) { if (StringUtils.isISO_8859_1(cmodel)) { readmodel = false; break; } else { // delayed token mode for non-ISO desperate cases cpipes = 'a'; } } else { readmodel = false; break; } } else { cpipes = token.charAt(cn); } //System.out.println("offset=" + offset + " cmodel(" + cmodel + ") cpipes(" + cpipes + ")"); if (Character.toLowerCase(cpipes) == Character.toLowerCase(cmodel) || (cmodel == '|' && cpipes == '-')) { if (cmodel == '|') { paired_token += "|"; //scape char for features } else { paired_token += cmodel; } if (token_offset == -1) { token_offset = offset; } // multi-dashes problem ('---' is 
translated by e.g. Roth to '-') if (usingNotUTF8tool && cmodel == '-' && cn == token.length() - 1) { // read a new char (cmodel) if not end of file to check multi-dash if (!((cmodel = (char) modelreader.read()) == -1)) { readmodel = false; offset++; if (cmodel == '-') { cn--; } //if (cmodel == ' ' || cmodel == '\n' || cmodel == '\r' || cmodel == '\t') { //cn++; //readmodel = true; //} } } else { readmodel = true; } //readmodel = true; } else { if (cmodel == ' ' || cmodel == '\t' || cmodel == '\n' || cmodel == '\r') { cn--; /* DEPRECATED: if ((cmodel == ' ' || cmodel == '\t') && token_offset == -1) {token_leading_blanks++;}if (cmodel == '\n' && token_offset == -1) {token_leading_newlines++; }*/ if (token_offset == -1 && paired_token.equals("")) { if (cmodel == ' ') { leadingBlanksString += "s"; } else { if (cmodel == '\t') { leadingBlanksString += "t"; } else { if (cmodel == '\n') { leadingBlanksString += "n"; } else { if (Character.toLowerCase(cmodel) == '\r') { if ((cmodel = (char) modelreader.read()) != (char) -1) { offset++; if (Character.toLowerCase(cmodel) != '\n') { throw new Exception("End of pipesline not found (rn) " + "offset=" + offset + ". cmodel(" + cmodel + ") found instead."); } else { //DEPRECATED: token_leading_newlines++; leadingBlanksString += "n"; } } } } } } } else { // if (cmodel == ' ') {paired_token += " ";} // No please throw new Exception("A space, tab, or newline in the middle of the token cannot be paired, use UTF-8 NLP tools."); } } else { // Special for quotes (Roth translates " to `` or '') if (usingNotUTF8tool && (cmodel == '"' && ((cpipes == '`') || (cpipes == '\'')))) { if (cn + 1 < token.length() && cpipes == token.charAt(cn + 1)) { cn += 2; paired_token += cmodel; } } else { // Special for quotes2 (Roth sudenly changes '' by ``) if (usingNotUTF8tool && ((cmodel == '\'' || cmodel == '`') && (cpipes == '`' || cpipes == '\''))) { paired_token += cmodel; } else { // multi-dashes problem ('---' is translated by e.g. 
Roth to '-') if (usingNotUTF8tool && cmodel == '-' && cmodel_prev == '-') { paired_token += cmodel; readmodel = true; cn--; } else { // special for ISO NLP tools if (usingNotUTF8tool && !StringUtils.isISO_8859_1(cmodel)) { paired_token += cmodel; readmodel = true; cn--; } else { throw new Exception("Distinct chars " + paired_token + " offset=" + offset + " cmodel(" + cmodel + ") cpipes(" + cpipes + ")"); } } } } } } cn++; } // DEPRECATED: outfile.write(filename + "|" + sentence + "|" + tokn + "-" + token_leading_blanks + "-" + token_leading_newlines + "|" + paired_token); outfile.write(filename + "|" + sentence + "|" + tokn + "-" + leadingBlanksString + "|" + paired_token); for (int i = 1; i < pipesarr.length; i++) { outfile.write("|" + pipesarr[i]); } outfile.write("\n"); //DEPRECATED: token_leading_blanks = 0; token_leading_newlines = 0; leadingBlanksString = ""; tokn++; } else { // newline new sentence // DEPRECATED: outfile.write(pipesline + "\n"); // ommit this because of sentences tokn = 0; sentence++; } } } finally { if (pipesreader != null) { pipesreader.close(); } if (modelreader != null) { modelreader.close(); } if (outfile != null) { outfile.close(); } } } catch (Exception e) { System.err.println("Errors found (TIMEE):\n\t" + e.toString() + "- line:" + linen + "\n"); if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) { e.printStackTrace(System.err); System.exit(1); } return null; } return outputfile; } public static String merge_classik(String extentsfile, String attribsfile, String attrib) { String outputfile = null; try { outputfile = extentsfile + ".TempEval2-features-annotatedWith-attribs"; //String extentsfile = pipesfile.getFile().getCanonicalPath().substring(0, pipesfile.getFile().getCanonicalPath().indexOf(".")) + "." 
+ elemext + ".TempEval-extents"; PipesFile keypipes = new PipesFile(extentsfile); keypipes.isWellFormedOptimist(); BufferedWriter outfile = new BufferedWriter(new FileWriter(outputfile)); BufferedReader extentsreader = new BufferedReader(new FileReader(extentsfile)); BufferedReader pipesreader = new BufferedReader(new FileReader(new File(attribsfile))); try { String extentline; String[] extentarr = null; String pipesline; String[] pipesarr = null; while ((extentline = extentsreader.readLine()) != null) { extentarr = extentline.split("\\|"); if (pipesarr == null && (pipesline = pipesreader.readLine()) != null) { pipesarr = pipesline.split("\\|"); } if (pipesarr != null) { if (pipesarr[0].equals(extentarr[0]) && pipesarr[1].equals(extentarr[1]) && pipesarr[2].equals(extentarr[2])) { outfile.write(extentline + "|" + attrib + "=" + pipesarr[pipesarr.length - 1] + "\n"); pipesarr = null; } else { outfile.write(extentline + "|-\n"); } } else { outfile.write(extentline + "|-\n"); } } } finally { if (pipesreader != null) { pipesreader.close(); } if (extentsreader != null) { extentsreader.close(); } if (outfile != null) { outfile.close(); } } } catch (Exception e) { System.err.println("Errors found (TempEval):\n\t" + e.toString() + "\n"); if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) { e.printStackTrace(System.err); System.exit(1); } return null; } return outputfile; } public static String merge_classik_append(String appendfile, String attribsfile, String attrib) { String outputfile = null; try { outputfile = appendfile + ".TempEval2-features-annotatedWith-attribs-append"; //String extentsfile = pipesfile.getFile().getCanonicalPath().substring(0, pipesfile.getFile().getCanonicalPath().indexOf(".")) + "." 
+ elemext + ".TempEval-extents"; String extentsfile = appendfile; PipesFile keypipes = new PipesFile(extentsfile); keypipes.isWellFormedOptimist(); BufferedWriter outfile = new BufferedWriter(new FileWriter(outputfile)); BufferedReader extentsreader = new BufferedReader(new FileReader(extentsfile)); BufferedReader pipesreader = new BufferedReader(new FileReader(new File(attribsfile))); try { String extentline; String[] extentarr = null; String pipesline; String[] pipesarr = null; while ((extentline = extentsreader.readLine()) != null) { extentarr = extentline.split("\\|"); if (pipesarr == null && (pipesline = pipesreader.readLine()) != null) { pipesarr = pipesline.split("\\|"); } if (pipesarr != null) { if (pipesarr[0].equals(extentarr[0]) && pipesarr[1].equals(extentarr[1]) && pipesarr[2].equals(extentarr[2])) { outfile.write(extentline + ";" + attrib + "=" + pipesarr[pipesarr.length - 1] + "\n"); pipesarr = null; } else { outfile.write(extentline + "\n"); } } else { outfile.write(extentline + "\n"); } } } finally { if (pipesreader != null) { pipesreader.close(); } if (extentsreader != null) { extentsreader.close(); } if (outfile != null) { outfile.close(); } } } catch (Exception e) { System.err.println("Errors found (TempEval):\n\t" + e.toString() + "\n"); if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) { e.printStackTrace(System.err); System.exit(1); } return null; } return outputfile; } /** * The priority is on the file of the first paramenter * */ static String merge_pipes(String primary, String secondary) { String outputfile = null; try { outputfile = primary + "-merged"; BufferedWriter outfile = new BufferedWriter(new FileWriter(outputfile)); BufferedReader extentsreader = new BufferedReader(new FileReader(primary)); BufferedReader extentsreader2 = new BufferedReader(new FileReader(secondary)); try { String extentline; String[] extentarr = null; String extentline2; String[] extentarr2 = null; PipesFile 
keypipes = new PipesFile(primary); keypipes.isWellFormedOptimist(); int iob2col1 = keypipes.getColumn("element\\(IOB2\\)"); keypipes = new PipesFile(secondary); keypipes.isWellFormedOptimist(); int iob2col2 = keypipes.getColumn("element\\(IOB2\\)"); boolean firstO = true; while ((extentline = extentsreader.readLine()) != null) { extentarr = extentline.split("\\|"); if ((extentline2 = extentsreader2.readLine()) == null) { throw new Exception("Secondary file ended prematurely."); } extentarr2 = extentline2.split("\\|"); if (!extentarr[iob2col1].equals("O")) { if (iob2col1 == (extentarr.length - 1)) { outfile.write(extentline + "|-\n"); } else { outfile.write(extentline + "\n"); } firstO = true; } else { if (firstO && extentarr2[iob2col2].startsWith("I-")) { String tmpelem = extentarr2[iob2col2].substring(2); extentline2 = extentline2.replaceAll("\\|I-" + tmpelem, "\\|B-" + tmpelem); } if (iob2col2 == (extentarr2.length - 1)) { outfile.write(extentline2 + "|-\n"); } else { outfile.write(extentline2 + "\n"); } firstO = false; } } } finally { if (extentsreader != null) { extentsreader.close(); } if (outfile != null) { outfile.close(); } } } catch (Exception e) { System.err.println("Errors found (TempEval):\n\t" + e.toString() + "\n"); if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) { e.printStackTrace(System.err); System.exit(1); } return null; } return outputfile; } public static String putids(String pipes) { String outputfile = null; try { outputfile = pipes + "-ids"; HashMap<String, Integer> ids = new HashMap<String, Integer>(); PipesFile keypipes = new PipesFile(pipes); keypipes.isWellFormedOptimist(); BufferedWriter outfile = new BufferedWriter(new FileWriter(outputfile)); BufferedReader extentsreader = new BufferedReader(new FileReader(pipes)); try { String extentline; String[] extentarr = null; int iob2col = keypipes.getColumn("element\\(IOB2\\)"); while ((extentline = extentsreader.readLine()) != null) { extentarr 
= extentline.split("\\|"); if (extentarr[iob2col].startsWith("B-")) { String element = extentarr[iob2col].substring(extentarr[iob2col].lastIndexOf("-") + 1).toLowerCase(); Integer id = 1; if (ids.containsKey(element)) { ids.put(element, ids.get(element) + 1); id = ids.get(element); } else { ids.put(element, 1); } if (iob2col == (extentarr.length - 1)) { outfile.write(extentline + "|" + element.substring(0, 1) + "id=" + element.substring(0, 1) + id + "\n"); } else { if (extentarr[iob2col + 1].length() > 1) { outfile.write(extentline + ";" + element.substring(0, 1) + "id=" + element.substring(0, 1) + id + "\n"); } else { outfile.write(extentline.substring(0, extentline.lastIndexOf("|") + 1) + element.substring(0, 1) + "id=" + element.substring(0, 1) + id + "\n"); } } } else { if (iob2col == (extentarr.length - 1)) { outfile.write(extentline + "|-\n"); } else { outfile.write(extentline + "\n"); } } } } finally { if (extentsreader != null) { extentsreader.close(); } if (outfile != null) { outfile.close(); } } } catch (Exception e) { System.err.println("Errors found (TempEval):\n\t" + e.toString() + "\n"); if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) { e.printStackTrace(System.err); System.exit(1); } return null; } return outputfile; } public static void clean(String dir) { Process p; try { String[] command3 = {"/bin/sh", "-c", "rm -rf " + dir + "*TempEval2* " + dir + "*roleconf*"}; p = Runtime.getRuntime().exec(command3); BufferedReader stdInput = new BufferedReader(new InputStreamReader(p.getInputStream())); try { String line; while ((line = stdInput.readLine()) != null) { System.err.println(line); } } finally { if (stdInput != null) { stdInput.close(); } } } catch (Exception e) { System.err.println("Errors found (TIMEE):\n\t" + e.toString() + "\n"); if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) { e.printStackTrace(System.err); System.exit(1); } } } } // Remove if 
there are no problems /* public static String GetPairSpecialTempEval2_features_treetagger(PipesFile pipesfile, String plainmodel) { String outputfile = null; try { outputfile = plainmodel + ".TempEval2-features"; String filename = ""; if (pipesfile.getLanguage().equalsIgnoreCase("EN")) { filename = pipesfile.getFile().getAbsolutePath().substring(pipesfile.getFile().getCanonicalPath().lastIndexOf('/') + 1, pipesfile.getFile().getAbsolutePath().lastIndexOf(".treetag")); } BufferedWriter outfile = new BufferedWriter(new FileWriter(outputfile)); BufferedReader pipesreader = new BufferedReader(new FileReader(pipesfile.getFile())); BufferedReader modelreader = new BufferedReader(new FileReader(plainmodel)); try { int sentence = 0, tokn = 0; int tokcolumn = pipesfile.getColumn("(tok|word).*"); String pipesline; String[] pipesarr = null; char cmodel = '\0'; int offset = -1; boolean readmodel = true; int token_leading_blanks = 0; int token_leading_newlines = 0; while ((pipesline = pipesreader.readLine()) != null) { //System.out.println(pipesline); if (pipesline.trim().length() > 1) { pipesarr = pipesline.split("\\|"); String token = pipesarr[tokcolumn]; int token_offset = -1; String paired_token = ""; for (int cn = 0; cn < token.length(); cn++) { char cpipes = token.charAt(cn); if (readmodel) { if ((cmodel = (char) modelreader.read()) == -1) { throw new Exception("Premature end of model file"); } offset++; } else { readmodel = true; } //System.out.println("offset=" + offset + " cmodel(" + cmodel + ") cpipes(" + cpipes + ")"); if (Character.toLowerCase(cpipes) == Character.toLowerCase(cmodel)) { paired_token += cmodel; if (token_offset == -1) { token_offset = offset; } // multi-dashes problem if (cmodel == '-' && cn == token.length() - 1) { // read a new char (cmodel) if not end of file to check multi-dash if (!((cmodel = (char) modelreader.read()) == -1)) { readmodel = false; offset++; if (cmodel == '-') { cn--; } if (cmodel == ' ' || cmodel == '\n' || cmodel == '\r' || 
cmodel == '\t') { cn++; readmodel = true; } } } } else { if (cmodel == ' ' || cmodel == '\t' || cmodel == '\n' || cmodel == '\r') { cn--; if ((cmodel == ' ' || cmodel == '\t') && token_offset == -1) { token_leading_blanks++; } if (cmodel == '\n' && token_offset == -1) { token_leading_newlines++; } } else { // Special for quotes if (cmodel == '"' && ((cpipes == '`') || (cpipes == '\''))) { if (cn + 1 < token.length() && cpipes == token.charAt(cn + 1)) { cn += 2; paired_token += cmodel; } } else { if (((cmodel == '\'' || cmodel == '`') && (cpipes == '`' || cpipes == '\'')) || (cmodel == '—')) { paired_token += cmodel; } else { throw new Exception("Distinct chars " + paired_token + " offset=" + offset + " cmodel(" + cmodel + ") cpipes(" + cpipes + ")"); } } } } } /////////////////////////////////////////// //outfile.write(pipesfile.getFile().getName()+"|"+sentence+"|"+token); if (!pipesarr[1].equalsIgnoreCase("NP")) { pipesarr[2] = pipesarr[2].toLowerCase(); // lemma to lower case } outfile.write(filename + "|" + sentence + "|" + tokn + "-" + token_leading_blanks + "-" + token_leading_newlines + "|" + pipesarr[0] + "|" + pipesarr[1] + "|-|-|-|-|" + pipesarr[2] + "|-|-|-|-|-|-|-|-|-|-|-|-\n"); token_leading_blanks = 0; token_leading_newlines = 0; tokn++; } else { // newline new sentence //System.out.println("cmodel(" + cmodel + ")"); if (Character.toLowerCase(cmodel) != '\n' && Character.toLowerCase(cmodel) != '\r' && Character.toLowerCase(cmodel) != ' ' && Character.toLowerCase(cmodel) != '\t') { if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) { System.err.println("Ignore Treetager newline"); } } else { if (Character.toLowerCase(cmodel) == ' ' || Character.toLowerCase(cmodel) != '\t') { token_leading_blanks++; } if (Character.toLowerCase(cmodel) == '\n') { token_leading_newlines++; } if (Character.toLowerCase(cmodel) == '\r') { if ((cmodel = (char) modelreader.read()) != (char) -1) { offset++; if 
(Character.toLowerCase(cmodel) != '\n') { throw new Exception("End of pipesline not found (rn) " + "offset=" + offset + ". cmodel(" + cmodel + ") found instead."); } else { token_leading_newlines++; } } } } //outfile.write(pipesline + "\n"); // ommit this because of sentences tokn = 0; sentence++; } } } finally { if (pipesreader != null) { pipesreader.close(); } if (modelreader != null) { modelreader.close(); } if (outfile != null) { outfile.close(); } } } catch (Exception e) { System.err.println("Errors found (TIMEE):\n\t" + e.toString() + "\n"); if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) { e.printStackTrace(System.err); System.exit(1); } return null; } return outputfile; } */