package com.cognitionis.nlp_files; import com.cognitionis.utils_basickit.StringUtils; import java.io.*; import java.nio.file.*; import static java.nio.file.StandardCopyOption.*; import java.util.*; /** * TokenizedFile consists of a first column of tokens (index) and numfields * optional number of fields separated by field_separator_re Example: * Tokenization, Annotation, Features,... the fields in these files are commonly * separated by space, tab or pipe (|) * * @author Héctor Llorens * @since 2011 */ public class TokenizedFile extends NLPFile { private int numfields; private String field_separator_re; private String field_separator_out; public TokenizedFile(String filename) { this(filename, "\\s+", 0); } public TokenizedFile(String filename, String separator) { this(filename, separator, 0); } public TokenizedFile(String filename, String separator, int n) { super(filename); isWellFormatted = false; numfields = n; field_separator_re = separator; field_separator_out = obtainFieldSeparatorOut(); isWellFormatted(); // obligatory step (gets numfields) } @Override public Boolean isWellFormatted() { try { if (super.getFile() == null) { throw new Exception("No file loaded in NLPFile object"); } try (BufferedReader reader = new BufferedReader(new FileReader(this.f))) { String line; int linen = 0; while ((line = reader.readLine()) != null) { line = line.trim(); linen++; if (line.length() != 0 && numfields == 0) { numfields = line.split(field_separator_re).length; } if (line.length() != 0 && numfields != 0) { if (numfields != line.split(field_separator_re).length) { throw new Exception("Line " + linen + " (" + line + "): Different number of fields. Expected: " + numfields + " - Found:" + line.split("\\|").length); } } } if(numfields==0) throw new Exception("Empty file: "+this.f); } } catch (Exception e) { if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) { e.printStackTrace(System.err); System.exit(1); } else { System.err.println("Errors found (" + this.getClass().getSimpleName() + "):\n\t" + e.toString() + "\n"); } this.isWellFormatted = false; return false; } this.isWellFormatted = true; return true; } @Override public String toPlain(String filename) { throw new UnsupportedOperationException("This will consist of transform tokens to plain text if possible (will look for a column with offset or spacing info). Generated by an alignment"); //To change body of generated methods, choose Tools | Templates. } public int getLastDescColumn() { return numfields - 1; } public int getNumFields() { return numfields; } public String getFieldSeparatorRE() { return field_separator_re; } public String getFieldSeparatorOut() { return field_separator_out; } public String obtainFieldSeparatorOut() { field_separator_out = " "; if (!field_separator_re.matches("^.*(\\\\s|[+*]).*$")) { field_separator_out = field_separator_re; } return field_separator_out; } public HashMap<String,Integer> getTokenCount(){ return getTokenCount(0); } public HashMap<String,Integer> getTokenCount(int field){ HashMap<String,Integer> token_count=new HashMap<>(); try { if(field>this.numfields) throw new Exception("token field ("+field+") is bigger than number of fields: "+this.numfields); try (BufferedReader reader = new BufferedReader(new FileReader(this.f))) { String line; while ((line = reader.readLine()) != null) { line = line.trim(); if (line.length() != 0) { String token = line.split(field_separator_re)[field]; Integer current_token_count=token_count.get(token); if(current_token_count!=null){ token_count.put(token, current_token_count +1); }else{ token_count.put(token, 1); } } } if(numfields==0) throw new Exception("Empty file: "+this.f); } } catch (Exception e) { if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) { e.printStackTrace(System.err); System.exit(1); } else { System.err.println("Errors found (" + this.getClass().getSimpleName() + "):\n\t" + e.toString() + "\n"); } } return token_count; } public String add_IOB_from_XML(String xmlfile, String root_tag, String elements_re, String attribs_re, String mergeattrib) { String field_separator_out = " "; if (!field_separator_re.matches("^.*(\\\\s|[+*]).*$")) { field_separator_out = field_separator_re; } String outputfile = this.getFile().toString() + "-annotationKey"; if (elements_re == null) { elements_re = ".*"; } if (!elements_re.equals(".*")) { outputfile += "-" + elements_re; } if (attribs_re != null && !attribs_re.equals(".*")) { outputfile += "-" + attribs_re.replaceAll("([.]?\\*|[\"=])", "").replace('|', '_'); } if (mergeattrib != null) { outputfile += "-" + mergeattrib; } try { if (!this.isWellFormatted) { throw new Exception("Malformed file"); } int tokcolumn = 0; int last_desc_column = this.getLastDescColumn(); boolean hasRoot_tag = false; char cxml = '\0'; String line; String tag = "", attribs = "-", inTag = "", inAttribs = "-"; //boolean closingtag = false; char BIO = 'O'; try (BufferedReader tokenizedreader = new BufferedReader(new FileReader(this.f)); BufferedReader xmlreader = new BufferedReader(new FileReader(xmlfile)); BufferedWriter outfile = new BufferedWriter(new FileWriter(outputfile));) { // find root tag while (true) { if ((cxml = (char) xmlreader.read()) == -1) { throw new Exception("Premature end of model file"); } if (cxml == '<') { if ((cxml = (char) xmlreader.read()) == -1) { throw new Exception("Premature end of model file"); } do { tag += cxml; if ((cxml = (char) xmlreader.read()) == -1) { throw new Exception("Premature end of model file"); } } while (cxml != '>'); if (tag.equalsIgnoreCase(root_tag)) { hasRoot_tag = true; break; } tag = ""; } //System.err.print(cxml); } if (!hasRoot_tag) { throw new Exception("Root tag " + root_tag + " not found"); } tag = ""; cxml = '\0'; while ((line = tokenizedreader.readLine()) != null) { line = line.trim(); if (line.length() != 0) { String[] linearr = line.split(field_separator_re); if (linearr.length >= 1) { //System.out.println(line); String token = linearr[tokcolumn]; boolean interTokenTag = false; boolean findtokenIter = false; boolean delayed_closing = false; char prevxmlchar = 'x'; char prevprevxmlchar = 'x'; for (int cn = 0; cn < token.length(); cn++) { char cpipes = token.charAt(cn); prevprevxmlchar = prevxmlchar; prevxmlchar = cxml; if ((cxml = (char) xmlreader.read()) == -1) { throw new Exception("Premature end of model file"); } //System.err.println("cxml(" + cxml + ") cpipes(" + cpipes + "," + cn + ") "+inTag); if (Character.toLowerCase(cpipes) != Character.toLowerCase(cxml)) { if (cxml == ' ' || cxml == '\n' || cxml == '\r' || cxml == '\t') { cn--; //System.err.println("blank found cn="+cn); } else { // tags handling if (cxml == '<') { if (cn != 0) { interTokenTag = true; } cn--; while (((cxml = (char) xmlreader.read()) != (char) -1) && cxml != '>') { tag += cxml; } tag = tag.trim(); if (tag.indexOf(' ') != -1) { attribs = tag.substring(tag.indexOf(' ') + 1); tag = tag.substring(0, tag.indexOf(' ')); } //System.err.println("tag=" + tag + " attribs=" + attribs); if (tag.matches("(?i)" + elements_re) && !tag.startsWith("/")) { findtokenIter = true; //System.err.println("LOOKING opening tag=" + tag + " attribs=" + attribs); if (interTokenTag) { System.err.println(xmlfile + " Inter-token (" + cn + ") tag consider manual tokenizing: " + token); } if (!inTag.equals("")) { throw new Exception(xmlfile + " Nested tags (" + tag + "/" + inTag + ") consider manual correction " + token); } inTag = tag; inAttribs = attribs; tag = ""; attribs = "-"; BIO = 'B'; if (attribs_re != null && !inAttribs.matches("(?i)" + attribs_re)) { BIO = 'O'; inTag = ""; inAttribs = "-"; findtokenIter = false; interTokenTag = false; } if (mergeattrib != null) { String tmpattrib = inAttribs.substring(inAttribs.indexOf(mergeattrib + "=")).substring(mergeattrib.length() + 1); tmpattrib = tmpattrib.replace("\"", ""); if (tmpattrib.indexOf(' ') != -1) { tmpattrib = tmpattrib.substring(0, tmpattrib.indexOf(' ')); } inTag = inTag + "+" + tmpattrib; } /* if (inTag.equals("EVENT")) { inAttribs = inAttribs.substring(inAttribs.indexOf("class=")).substring(6); inAttribs = inAttribs.replace("\"", ""); if (inAttribs.indexOf(' ') != -1) { inAttribs = inAttribs.substring(0, inAttribs.indexOf(' ')); } }*/ } else { interTokenTag = false; /*if (tag.contains("TIMEX3") && !tag.matches("/" + inTag)) { System.err.println("problema:" + tag + " intag:" + inTag); System.exit(1); }*/ } // check if closing if (tag.matches("/.*")) { String check = inTag; if (mergeattrib != null && inTag.matches(".+\\+.+")) { check = inTag.substring(0, inTag.indexOf('+')); } if (tag.matches("/" + "(?i)" + check)) { if (findtokenIter) { // safe for empty tags (events_4_instances and timex3_4_durations) if (cn >= 0) { System.err.println(xmlfile + " Inter Token end of tag (" + inTag + ") cn=" + cn + " " + line); delayed_closing = true; } else { BIO = 'O'; inTag = ""; inAttribs = "-"; findtokenIter = false; interTokenTag = false; } } else { //System.err.println("closing tag=" + inTag); BIO = 'O'; inTag = ""; } } } // check if end root_tag if (tag.matches("/" + "(?i)" + root_tag)) { System.err.println("closing root_tag=" + root_tag); // do something // it never reaches this because tok file ends before. } tag = ""; attribs = "-"; } else { // escaped & < > if (cxml == '&' || (prevxmlchar == '&' && cxml == 'a') || (prevprevxmlchar == ';' && prevxmlchar == ' ' && cxml == 'a')) { cn--; while (((cxml = (char) xmlreader.read()) != (char) -1) && cxml != ';') { if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) { System.err.println("Reading XML escaped char in: " + token); } } } else { throw new Exception(xmlfile + " Distinct chars cxml(" + cxml + ") cpipes(" + cpipes + ")"); } } } } } //System.out.print("paired output: "); for (int i = 0; i < linearr.length - 1; i++) { // There are roles columns in the sentence if (i == last_desc_column) { outfile.write(linearr[i] + field_separator_out + BIO); if (BIO != 'O') { // && !inTag.equals("") outfile.write("-" + inTag); //System.err.println(BIO+"-" + inTag); } if (attribs_re != null) { outfile.write(field_separator_out + inAttribs + field_separator_out); } if (BIO == 'B') { BIO = 'I'; inAttribs = "-"; } } else { outfile.write(linearr[i] + field_separator_out); } } // There arent roles columns in the sentences if (linearr.length - 1 == last_desc_column) { outfile.write(linearr[linearr.length - 1] + field_separator_out + BIO); if (BIO != 'O') { // && !inTag.equals("") outfile.write("-" + inTag); //System.err.println(BIO+"-" + inTag); } if (attribs_re != null) { outfile.write(field_separator_out + inAttribs + field_separator_out); } if (BIO == 'B') { BIO = 'I'; inAttribs = "-"; } } else { outfile.write(linearr[linearr.length - 1]); } if (delayed_closing) { BIO = 'O'; inTag = ""; inAttribs = "-"; findtokenIter = false; interTokenTag = false; delayed_closing = false; } outfile.write("\n"); } else { if (!inTag.equals("")) { throw new Exception(xmlfile + " Broken tag: " + inTag + " at the end of the file/sentence"); } outfile.write(line + "\n"); } } else { outfile.write("\n"); } } } } catch (Exception e) { System.err.println("Errors found (" + this.getClass().getSimpleName() + "):\n\t" + e.toString() + "\n"); if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) { e.printStackTrace(System.err); System.exit(1); } return null; } return outputfile; } /*MERGE WITH NEXT FUNCTION, ALSO ADD ANOTHER TO MERGE FROM XML INSTEAD OF PLAIN AND ADD PARAMETERS TO KEEP HEADER/FOTER AND TO REMOVE SPECIFIC ELEMENTS IF NEEDED*/ public String generateXML(String plainmodel, String id_string) { String outputfile; int linen = 0; try { boolean usingNotUTF8tool = false; // make it true if there might be strange chars due to enconding problems outputfile = plainmodel + ".xml"; BufferedWriter outfile = new BufferedWriter(new FileWriter(outputfile)); BufferedReader pipesreader = new BufferedReader(new FileReader(this.getFile())); BufferedReader modelreader = new BufferedReader(new FileReader(plainmodel)); try { int sentence = 0, tokn = 0; int tokcolumn = 0; String pipesline; String[] pipesarr = null; char cmodel = '\0'; char cmodel_prev = '\0'; int offset = -1; boolean readmodel = true; /*int token_leading_blanks = 0;int token_leading_tabs = 0;int token_leading_newlines = 0;*/ String leadingBlanksString = ""; while ((pipesline = pipesreader.readLine()) != null) { linen++; //System.err.println(pipesline); if (pipesline.trim().length() > 1) { pipesarr = pipesline.split(this.field_separator_re); String token = pipesarr[tokcolumn]; int token_offset = -1; String paired_token = ""; int cn = 0; while (true) { if (readmodel) { cmodel_prev = cmodel; if ((cmodel = (char) modelreader.read()) == -1) { if (cn == token.length()) { break; // save last token end of file } else { throw new Exception("Premature end of model file"); } } offset++; } else { readmodel = true; } char cpipes = '\0'; if (cn >= token.length()) { if (usingNotUTF8tool) { if (StringUtils.isISO_8859_1(cmodel)) { readmodel = false; break; } else { // delayed token mode for non-ISO desperate cases cpipes = 'a'; } } else { readmodel = false; break; } } else { cpipes = token.charAt(cn); } //System.out.println("offset=" + offset + " cmodel(" + cmodel + ") cpipes(" + cpipes + ")"); if (Character.toLowerCase(cpipes) == Character.toLowerCase(cmodel) || (cmodel == '|' && cpipes == '-')) { if (cmodel == '|') { paired_token += "|"; //scape char for features } else { paired_token += cmodel; } if (token_offset == -1) { token_offset = offset; } // multi-dashes problem ('---' is translated by e.g. Roth to '-') if (usingNotUTF8tool && cmodel == '-' && cn == token.length() - 1) { // read a new char (cmodel) if not end of file to check multi-dash if (!((cmodel = (char) modelreader.read()) == -1)) { readmodel = false; offset++; if (cmodel == '-') { cn--; } //if (cmodel == ' ' || cmodel == '\n' || cmodel == '\r' || cmodel == '\t') { //cn++; //readmodel = true; //} } } else { readmodel = true; } //readmodel = true; } else { if (cmodel == ' ' || cmodel == '\t' || cmodel == '\n' || cmodel == '\r') { cn--; /* DEPRECATED: if ((cmodel == ' ' || cmodel == '\t') && token_offset == -1) {token_leading_blanks++;}if (cmodel == '\n' && token_offset == -1) {token_leading_newlines++; }*/ if (token_offset == -1 && paired_token.equals("")) { if (cmodel == ' ') { leadingBlanksString += "s"; } else { if (cmodel == '\t') { leadingBlanksString += "t"; } else { if (cmodel == '\n') { leadingBlanksString += "n"; } else { if (Character.toLowerCase(cmodel) == '\r') { if ((cmodel = (char) modelreader.read()) != (char) -1) { offset++; if (Character.toLowerCase(cmodel) != '\n') { throw new Exception("End of pipesline not found (rn) " + "offset=" + offset + ". cmodel(" + cmodel + ") found instead."); } else { //DEPRECATED: token_leading_newlines++; leadingBlanksString += "n"; } } } } } } } else { // if (cmodel == ' ') {paired_token += " ";} // No please throw new Exception("A space, tab, or newline in the middle of the token cannot be paired, use UTF-8 NLP tools."); } } else { // Special for quotes (Roth translates " to `` or '') if (usingNotUTF8tool && (cmodel == '"' && ((cpipes == '`') || (cpipes == '\'')))) { if (cn + 1 < token.length() && cpipes == token.charAt(cn + 1)) { cn += 2; paired_token += cmodel; } } else { // Special for quotes2 (Roth sudenly changes '' by ``) if (usingNotUTF8tool && ((cmodel == '\'' || cmodel == '`') && (cpipes == '`' || cpipes == '\''))) { paired_token += cmodel; } else { // multi-dashes problem ('---' is translated by e.g. Roth to '-') if (usingNotUTF8tool && cmodel == '-' && cmodel_prev == '-') { paired_token += cmodel; readmodel = true; cn--; } else { // special for ISO NLP tools if (usingNotUTF8tool && !StringUtils.isISO_8859_1(cmodel)) { paired_token += cmodel; readmodel = true; cn--; } else { throw new Exception("Distinct chars " + paired_token + " offset=" + offset + " cmodel(" + cmodel + ") cpipes(" + cpipes + ")"); } } } } } } cn++; } // DEPRECATED: outfile.write(filename + "|" + sentence + "|" + tokn + "-" + token_leading_blanks + "-" + token_leading_newlines + "|" + paired_token); outfile.write(sentence + "|" + tokn + "-" + leadingBlanksString + "|" + paired_token); for (int i = 1; i < pipesarr.length; i++) { outfile.write("|" + pipesarr[i]); } outfile.write("\n"); //DEPRECATED: token_leading_blanks = 0; token_leading_newlines = 0; leadingBlanksString = ""; tokn++; } else { // newline new sentence // DEPRECATED: outfile.write(pipesline + "\n"); // ommit this because of sentences tokn = 0; sentence++; } } } finally { pipesreader.close(); modelreader.close(); outfile.close(); } } catch (Exception e) { System.err.println("Errors found (TIMEE):\n\t" + e.toString() + "- line:" + linen + "\n"); if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) { e.printStackTrace(System.err); System.exit(1); } return null; } return outputfile; } // TODO /* TODO MERGE THIS FUNCTION WITH THE PREVIOUS ONE*/ public String pipes2tml(String id_string) { if (id_string == null) { id_string = "id"; } String last_text_blanks = ""; // todo averiguar como hacer String outputfile = null; BufferedWriter outfile = null; int linen = 0; int fake_id = 1; try { int iob2col = this.getLastDescColumn(); int wordcol = 0; BufferedReader pipesreader = new BufferedReader(new FileReader(this.getFile())); String inElem = ""; try { String pipesline; String prev_token = ""; String[] pipesarr = null; String curr_fileid = ""; String curr_sentN = ""; while ((pipesline = pipesreader.readLine()) != null) { linen++; pipesarr = pipesline.split("\\|"); if (!curr_fileid.equals(pipesarr[0])) { if (outfile != null && !curr_fileid.equals("")) { outfile.write(last_text_blanks + "</TEXT>\n\n"); outfile.close(); outfile = null; fake_id = 1; } outputfile = this.getFile().getAbsolutePath() + ".xml"; outfile = new BufferedWriter(new FileWriter(outputfile)); curr_fileid = pipesarr[0]; // initialize file outfile.write("<?xml version=\"1.0\" ?>"); outfile.write("<TEXT>\n"); } if (!curr_sentN.equals(pipesarr[1])) { curr_sentN = pipesarr[1]; //outfile.write("\n"); } String preceding_blanks = ""; if (pipesarr[2].matches(".*-[stn]*")) { String[] blanksarr = pipesarr[2].split("-"); if (blanksarr.length > 1) { int x = blanksarr[1].length(); for (int i = 0; i < x; i++) { if (blanksarr[1].charAt(i) == 's') { preceding_blanks += " "; } else { if (blanksarr[1].charAt(i) == 'n') { preceding_blanks += "\n"; } else { if (blanksarr[1].charAt(i) == 't') { preceding_blanks += "\t"; } } } } } } if (pipesarr[iob2col].startsWith("B")) { if (!inElem.equals("")) { outfile.write("</" + inElem + ">"); inElem = ""; } inElem = pipesarr[iob2col].substring(2).toUpperCase(); outfile.write(preceding_blanks + "<" + inElem); // fake attribs/id String tmp_id_string = id_string; if (id_string == "1st_letter") { tmp_id_string = inElem.substring(0, 1).toLowerCase() + "id"; } outfile.write(" " + tmp_id_string + "=\"" + fake_id + "\" "); fake_id++; } outfile.write(">"); if (pipesarr[iob2col].startsWith("I")) { if (inElem.equals("")) { throw new Exception("Found I-X element without B-X (" + pipesarr[iob2col] + ")"); } } if (pipesarr[iob2col].equals("O")) { if (!inElem.equals("")) { outfile.write("</" + inElem + ">"); inElem = ""; } } if (!pipesarr[iob2col].startsWith("B")) { outfile.write(preceding_blanks); } outfile.write(pipesarr[wordcol].replaceAll("|", "|").replaceAll("&", "&").replaceAll("<", "<").replaceAll(">", ">")); prev_token = pipesarr[wordcol]; } if (!inElem.equals("")) { outfile.write("</" + inElem + ">"); inElem = ""; } outfile.write(last_text_blanks + "</TEXT>\n\n"); } finally { pipesreader.close(); outfile.close(); } } catch (Exception e) { System.err.println("Errors found (TempEval):\n\t" + e.toString() + " (Reading line " + linen + ")\n"); if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) { e.printStackTrace(System.err); System.exit(1); } return null; } return outputfile; } public void correctIOB() { correctIOB(this.getLastDescColumn()); } public void correctIOB(int IOB2column) { try { if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) { System.err.println("Correcting IOB..."); } String outputfile = this.getFile().getCanonicalPath() + "-IOB2checked"; try (BufferedReader tokenizedreader = new BufferedReader(new FileReader(this.getFile())); BufferedWriter outfile = new BufferedWriter(new FileWriter(outputfile))) { String pipesline; String[] pipesarr; String previousIOB2 = "O"; int linen = 0; while ((pipesline = tokenizedreader.readLine()) != null) { linen++; pipesarr = pipesline.split(this.field_separator_re); // CORRECT IF NEEDED if (pipesarr.length > 1 && previousIOB2.equals("O") && pipesarr[IOB2column].startsWith("I-")) { System.err.println("IOB Corrected: I -> B"); int i; for (i = 0; i < (pipesarr.length - 1); i++) { if (i != IOB2column) { outfile.write(pipesarr[i] + this.field_separator_out); } else { outfile.write(pipesarr[i].replaceFirst("I-", "B-") + this.field_separator_out); } } if (i == IOB2column) { outfile.write(pipesarr[i].replaceFirst("I-", "B-")); } else { outfile.write(pipesarr[i]); } outfile.write("\n"); previousIOB2 = "B"; } else { outfile.write(pipesline + "\n"); if (pipesarr.length > 1) { previousIOB2 = pipesarr[IOB2column].substring(0, 1); } else { previousIOB2 = "O"; } } } } finally { File out = new File(outputfile); Files.copy(Paths.get(out.toURI()), Paths.get(this.getFile().toURI()), REPLACE_EXISTING); out.delete(); } } catch (Exception e) { System.err.println("Errors found (TempEval):\n\t" + e.toString() + "\n"); if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) { e.printStackTrace(System.err); System.exit(1); } } } }