package com.cognitionis.feature_builder; import com.cognitionis.knowledgek.TIMEK.TIMEK; import com.cognitionis.nlp_files.PipesFile; import com.cognitionis.utils_basickit.XmlAttribs; import java.io.*; import java.util.*; /** * * @author Héctor Llorens * @since 2011 */ public class Classification { public static void classik_clear_features(HashMap<String, String> features) { features.put("file", "-"); // the first features.put("sentN", "-"); // the first features.put("tokN", "-"); // the first features.put("word", ""); // combination features.put("pos", "-"); // combination features.put("lemma", "-"); // combination features.put("roleconf", "-"); // the first features.put("simpleroles", "-"); // the distinct features.put("depverb", "-"); // the first features.put("tense", "-"); // the first features.put("polarity", "-"); // the first features.put("mainphrase", "-"); // the distinct features.put("PPdetail", "-"); // the first features.put("wn", "-"); // the timerelatedone or the last features.put("element", "-"); // the first } public static String get_classik(String features_and_attributes, String lang) { String output; PipesFile nlpfile = new PipesFile(features_and_attributes); ((PipesFile) nlpfile).isWellFormedOptimist(); output = getClassikFormat((PipesFile) nlpfile, new Locale(lang)); return output; } public static String getClassikFormat(PipesFile pipesfile, Locale l) { String outputfile = null; int numline = 0; try { outputfile = pipesfile.getFile().getCanonicalPath() + ".TempEval-classik-features"; BufferedWriter outfile = new BufferedWriter(new FileWriter(outputfile)); BufferedReader pipesreader = new BufferedReader(new FileReader(pipesfile.getFile())); TIMEK timek = new TIMEK(l); HashMap<String, String> features = new HashMap<String, String>(); int iob2col = pipesfile.getColumn("element\\(IOB2\\)"); int attrscol = iob2col + 1; int tempevalfilecol = pipesfile.getColumn("file"); int tokencol = pipesfile.getColumn("(word|token)"); int POScol = pipesfile.getColumn("(POS|pos)"); int lemmacol = pipesfile.getColumn("lemma"); int roleconfcol = pipesfile.getColumn("roleconf"); int simplerolescol = pipesfile.getColumn("simplerolesIOB2"); int tensecol = pipesfile.getColumn("tense"); int ppdetailcol = pipesfile.getColumn("PPdetail"); int depverbcol = pipesfile.getColumn("depverb"); int polaritycol = pipesfile.getColumn("assertype"); int mainphrasecol = pipesfile.getColumn("iobmainphrase"); int wncol = pipesfile.getColumn("wn"); Boolean attribsCheck = false; if (iob2col == -1 || tokencol == -1 || POScol == -1) { String notFoundCol = ""; if (iob2col == -1) { notFoundCol += "element,attribs,"; } if (tokencol == -1) { notFoundCol += "word/token,"; } if (POScol == -1) { notFoundCol += "POS,"; } throw new Exception("Some of the required columns (element,word/token,POS) not found: " + notFoundCol); } String pipesline; String[] pipesarr = null; String tempexNorm = ""; String tempexPattern = ""; classik_clear_features(features); HashMap<String, String> tempexAttribsHash = null; String pipeslineant = "--prior first line--"; ArrayList<String> sentence = null; HashMap<String, String> VerbeventDep = null; try { String curr_fileid = ""; String curr_sentN = ""; String line; String[] linearr; int numsent = 0; while ((line = pipesreader.readLine()) != null) { numline++; linearr = line.split("\\|"); if (curr_fileid.equals("")) { curr_fileid = linearr[0]; } if (curr_sentN.equals("")) { curr_sentN = linearr[1]; } //System.out.println(curr_fileid+" "+curr_sentN+" "+linearr[0]+" "+linearr[1]+"\n"); if (curr_fileid.equals(linearr[0]) && curr_sentN.equals(linearr[1])) { //System.out.println(curr_fileid+" adding "+curr_sentN+"\n"); if (sentence == null) { sentence = new ArrayList(); VerbeventDep = new HashMap<String, String>(); } sentence.add(line); // GUARDAR VERB-EVENT-A1 DEPENDENCIES // ARRAY AMB TOTS ELS VERBS Q TINGUEN EVENTS A1 //pipesarr = line.split("\\|"); if (line.endsWith("B-event") && !linearr[POScol].matches("(V.*|AUX)")) { //System.out.println(linearr[lemmacol] + " " + linearr[depverbcol] + " " + line); VerbeventDep.put(linearr[depverbcol], linearr[lemmacol] + "|" + linearr[POScol] + "|" + linearr[wncol] + "|" + linearr[simplerolescol]); } } else { // update curr_markers curr_fileid = linearr[0]; curr_sentN = linearr[1]; //System.out.println("Processing "+curr_fileid+" "+curr_sentN+" "+linearr[0]+" "+linearr[1]+"\n"); for (int numtok = 0; numtok < sentence.size(); numtok++) { //System.out.println("processing token "+numtok+" size="+sentence.size()); pipesline = sentence.get(numtok); pipesarr = pipesline.split("\\|"); if (!attribsCheck && pipesarr.length >= pipesfile.getPipesDescArrCount()) { if (iob2col == pipesarr.length - 1) { if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) { System.err.println("No attribs found. Formating file for testing"); } } else { if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) { System.err.println("Attribs found. Formating file for training"); } features.put("classik", "-"); } attribsCheck = true; } if (pipesarr.length >= pipesfile.getPipesDescArrCount()) { if (pipesarr[iob2col].matches("B-.*")) { if (!features.get("word").equals("")) { outfile.write(features.get("file") + "|" + features.get("sentN") + "|" + features.get("tokN") + "|" + features.get("word") + "|" + features.get("pos") + "|" + features.get("lemma") + "|" + features.get("roleconf") + "|" + features.get("simpleroles") + "|" + features.get("depverb") + "|" + features.get("tense") + "|" + features.get("polarity") + "|" + features.get("mainphrase") + "|" + features.get("PPdetail") + "|" + features.get("wn")); // specific features if (features.get("element").matches("(?i)timex.*")) { String normalizedTIMEK = timek.getNormTextandPattern(features.get("word")); String[] normalizedarr = normalizedTIMEK.split("\\|"); tempexNorm = normalizedarr[0]; tempexPattern = normalizedarr[1]; String[] tempexarr = tempexNorm.split("_"); String[] tempexNUarr = tempexPattern.split("_"); String granularity = "-"; String set_indicator = "0"; if (tempexNorm.matches(timek.SET_re)) { set_indicator = "1"; } if (tempexNUarr[tempexNUarr.length - 1].equalsIgnoreCase("TUNIT")) { granularity = "granul_date"; } if (tempexarr[tempexarr.length - 1].matches(timek.TOD_re)) { granularity = "granul_time"; } // normalizedtext | pattern | lastword|lastNU|lastwordgranularity|setinicator outfile.write("|" + tempexNorm + "|" + tempexPattern + "|" + tempexarr[tempexarr.length - 1] + "|" + tempexNUarr[tempexNUarr.length - 1] + "|" + granularity + "|" + set_indicator); } else { if (features.get("element").matches("(?i)event") && features.get("pos").matches("(V.*|AUX)") && VerbeventDep.containsKey(features.get("lemma"))) { outfile.write("|1|" + VerbeventDep.get(features.get("lemma"))+"|-"); } else { outfile.write("|-|-|-|-|-|-"); } } outfile.write("|" + features.get("element")); if (features.containsKey("classik")) { outfile.write("|" + features.get("classik")); } outfile.write("\n"); tempexNorm = ""; tempexPattern = ""; classik_clear_features(features); } String element = pipesarr[iob2col].substring(2); if (tempevalfilecol != -1) { features.put("file", pipesarr[tempevalfilecol]); // the first features.put("sentN", pipesarr[tempevalfilecol + 1]); // the first features.put("tokN", pipesarr[tempevalfilecol + 2]); // the first } features.put("word", pipesarr[tokencol]); features.put("pos", pipesarr[POScol]); features.put("lemma", pipesarr[lemmacol]); features.put("roleconf", pipesarr[roleconfcol]); features.put("simpleroles", pipesarr[simplerolescol]); features.put("depverb", pipesarr[depverbcol]); features.put("tense", pipesarr[tensecol]); features.put("polarity", pipesarr[polaritycol]); features.put("mainphrase", pipesarr[mainphrasecol]); features.put("PPdetail", pipesarr[ppdetailcol]); features.put("wn", pipesarr[wncol]); features.put("element", element); if (features.containsKey("classik")) { if (pipesarr[attrscol].matches(".*=.*=.*") && !pipesarr[attrscol].contains(";")) { tempexAttribsHash = XmlAttribs.parseXMLattrs(pipesarr[attrscol]); } else { tempexAttribsHash = XmlAttribs.parseSemiColonAttrs(pipesarr[attrscol]); } if (pipesarr[iob2col].matches("(?i).*timex.*")) { features.put("classik", tempexAttribsHash.get("type")); } else { features.put("classik", tempexAttribsHash.get("class")); } } } if (pipesarr[iob2col].matches("I-.*")) { if (features.get("word").equals("")) { throw new Exception("Malformed annotation: " + pipesline + "\n Prev: " + pipeslineant); } features.put("word", features.get("word") + "_" + pipesarr[tokencol]); features.put("pos", features.get("pos") + "_" + pipesarr[POScol]); features.put("lemma", features.get("lemma") + "_" + pipesarr[lemmacol]); String[] roles = features.get("simpleroles").split("_"); if (!roles[roles.length - 1].equals(pipesarr[simplerolescol])) { features.put("simpleroles", features.get("simpleroles") + "_" + pipesarr[simplerolescol]); } String[] phrases = features.get("mainphrase").split("_"); if (!phrases[phrases.length - 1].equals(pipesarr[mainphrasecol])) { features.put("mainphrase", features.get("mainphrase") + "_" + pipesarr[mainphrasecol]); } if (!features.get("wn").matches(".*(time|tiempo|periodo).*")) { features.put("wn", pipesarr[wncol]); } } } pipeslineant = pipesline; } numsent++; sentence = null; VerbeventDep = null; sentence = new ArrayList(); sentence.add(line); VerbeventDep = new HashMap<String, String>(); if (line.endsWith("B-event") && !linearr[POScol].matches("(V.*|AUX)")) { //System.out.println(line); VerbeventDep.put(linearr[depverbcol], linearr[lemmacol] + "|" + linearr[POScol] + "|" + linearr[wncol] + "|" + linearr[simplerolescol]); } } } if (sentence != null) { for (int numtok = 0; numtok < sentence.size(); numtok++) { pipesline = sentence.get(numtok); pipesarr = pipesline.split("\\|"); if (!attribsCheck && pipesarr.length >= pipesfile.getPipesDescArrCount()) { if (iob2col == pipesarr.length - 1) { if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) { System.err.println("No attribs found. Formating file for testing"); } } else { if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) { System.err.println("Attribs found. Formating file for training"); } features.put("classik", "-"); } attribsCheck = true; } if (pipesarr.length >= pipesfile.getPipesDescArrCount()) { if (pipesarr[iob2col].matches("B-.*")) { if (!features.get("word").equals("")) { outfile.write(features.get("file") + "|" + features.get("sentN") + "|" + features.get("tokN") + "|" + features.get("word") + "|" + features.get("pos") + "|" + features.get("lemma") + "|" + features.get("roleconf") + "|" + features.get("simpleroles") + "|" + features.get("depverb") + "|" + features.get("tense") + "|" + features.get("polarity") + "|" + features.get("mainphrase") + "|" + features.get("PPdetail") + "|" + features.get("wn")); if (features.get("element").matches("(?i)timex.*")) { String normalizedTIMEK = timek.getNormTextandPattern(features.get("word")); String[] normalizedarr = normalizedTIMEK.split("\\|"); tempexNorm = normalizedarr[0]; tempexPattern = normalizedarr[1]; String[] tempexarr = tempexNorm.split("_"); String[] tempexNUarr = tempexPattern.split("_"); String granularity = "-"; String set_indicator = "0"; if (tempexNorm.matches(timek.SET_re)) { set_indicator = "1"; } if (tempexNUarr[tempexNUarr.length - 1].equalsIgnoreCase("TUNIT")) { granularity = "granul_date"; } if (tempexarr[tempexarr.length - 1].matches(timek.TOD_re)) { granularity = "granul_time"; } // normalizedtext | pattern | lastword|lastNU|lastwordgranularity|setinicator outfile.write("|" + tempexNorm + "|" + tempexPattern + "|" + tempexarr[tempexarr.length - 1] + "|" + tempexNUarr[tempexNUarr.length - 1] + "|" + granularity + "|" + set_indicator); } else { if (features.get("element").matches("(?i)event") && features.get("pos").matches("(V.*|AUX)") && VerbeventDep.containsKey(features.get("lemma"))) { outfile.write("|1|" + VerbeventDep.get(features.get("lemma"))+"|-"); } else { outfile.write("|-|-|-|-|-|-"); } } outfile.write("|" + features.get("element")); if (features.containsKey("classik")) { outfile.write("|" + features.get("classik")); } outfile.write("\n"); tempexNorm = ""; tempexPattern = ""; classik_clear_features(features); } String element = pipesarr[iob2col].substring(2); if (tempevalfilecol != -1) { features.put("file", pipesarr[tempevalfilecol]); // the first features.put("sentN", pipesarr[tempevalfilecol + 1]); // the first features.put("tokN", pipesarr[tempevalfilecol + 2]); // the first } features.put("word", pipesarr[tokencol]); features.put("pos", pipesarr[POScol]); features.put("lemma", pipesarr[lemmacol]); features.put("roleconf", pipesarr[roleconfcol]); features.put("simpleroles", pipesarr[simplerolescol]); features.put("depverb", pipesarr[depverbcol]); features.put("tense", pipesarr[tensecol]); features.put("polarity", pipesarr[polaritycol]); features.put("mainphrase", pipesarr[mainphrasecol]); features.put("PPdetail", pipesarr[ppdetailcol]); features.put("wn", pipesarr[wncol]); features.put("element", element); if (features.containsKey("classik")) { if (pipesarr[attrscol].matches(".*=.*=.*") && !pipesarr[attrscol].contains(";")) { tempexAttribsHash = XmlAttribs.parseXMLattrs(pipesarr[attrscol]); } else { tempexAttribsHash = XmlAttribs.parseSemiColonAttrs(pipesarr[attrscol]); } if (pipesarr[iob2col].matches("(?i).*timex.*")) { features.put("classik", tempexAttribsHash.get("type")); } else { features.put("classik", tempexAttribsHash.get("class")); } } } if (pipesarr[iob2col].matches("I-.*")) { if (features.get("word").equals("")) { throw new Exception("Malformed annotation: " + pipesline + "\n Prev: " + pipeslineant); } features.put("word", features.get("word") + "_" + pipesarr[tokencol]); features.put("pos", features.get("pos") + "_" + pipesarr[POScol]); features.put("lemma", features.get("lemma") + "_" + pipesarr[lemmacol]); String[] roles = features.get("simpleroles").split("_"); if (!roles[roles.length - 1].equals(pipesarr[simplerolescol])) { features.put("simpleroles", features.get("simpleroles") + "_" + pipesarr[simplerolescol]); } String[] phrases = features.get("mainphrase").split("_"); if (!phrases[phrases.length - 1].equals(pipesarr[mainphrasecol])) { features.put("mainphrase", features.get("mainphrase") + "_" + pipesarr[mainphrasecol]); } if (!features.get("wn").matches(".*(time|tiempo|periodo).*")) { features.put("wn", pipesarr[wncol]); } } } pipeslineant = pipesline; } } if (!features.get("word").equals("")) { outfile.write(features.get("file") + "|" + features.get("sentN") + "|" + features.get("tokN") + "|" + features.get("word") + "|" + features.get("pos") + "|" + features.get("lemma") + "|" + features.get("roleconf") + "|" + features.get("simpleroles") + "|" + features.get("depverb") + "|" + features.get("tense") + "|" + features.get("polarity") + "|" + features.get("mainphrase") + "|" + features.get("PPdetail") + "|" + features.get("wn")); if (features.get("element").matches("(?i)timex.*")) { String normalizedTIMEK = timek.getNormTextandPattern(features.get("word")); String[] normalizedarr = normalizedTIMEK.split("\\|"); tempexNorm = normalizedarr[0]; tempexPattern = normalizedarr[1]; String[] tempexarr = tempexNorm.split("_"); String[] tempexNUarr = tempexPattern.split("_"); String granularity = "-"; String set_indicator = "0"; if (tempexNorm.matches(timek.SET_re)) { set_indicator = "1"; } if (tempexNUarr[tempexNUarr.length - 1].equalsIgnoreCase("TUNIT")) { granularity = "granul_date"; } if (tempexarr[tempexarr.length - 1].matches(timek.TOD_re)) { granularity = "granul_time"; } // normalizedtext | pattern | lastword|lastNU|lastwordgranularity|setinicator outfile.write("|" + tempexNorm + "|" + tempexPattern + "|" + tempexarr[tempexarr.length - 1] + "|" + tempexNUarr[tempexNUarr.length - 1] + "|" + granularity + "|" + set_indicator); } else { if (features.get("element").matches("(?i)event") && features.get("pos").matches("(V.*|AUX)") && VerbeventDep.containsKey(features.get("lemma"))) { outfile.write("|1|" + VerbeventDep.get(features.get("lemma"))+"|-"); } else { outfile.write("|-|-|-|-|-|-"); } } outfile.write("|" + features.get("element")); if (features.containsKey("classik")) { outfile.write("|" + features.get("classik")); } outfile.write("\n"); tempexNorm = ""; tempexPattern = ""; classik_clear_features(features); } } finally { if (pipesreader != null) { pipesreader.close(); } if (outfile != null) { outfile.close(); } } } catch (Exception e) { System.err.println("Errors found (CLASSIK):\n\t" + e.toString() + "\n"); if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) { e.printStackTrace(System.err); System.exit(1); } return null; } return outputfile; } }