TokenizedFile.java example

Explorer
cognitionis-nlp-libraries-master
- external-tools
  - src
    - main
      - java
        com
        cognitionis
        external_tools
        CRF.java
        CoNLL_scorer.java
        FreeLing.java
        Main.java
        MaltParser.java
        SRL_Roth.java
        SVM.java
        TempEval_scorer.java
        Tokenizer_TreeTagger.java
        TreeTagger.java
        WNInterface.java
- feature-builder
  - src
    - main
      - java
        com
        cognitionis
        feature_builder
        BaseTokenFeatures.java
        CategorizationTE2.java
        Classification.java
        Main.java
        Timen.java
        TimexNormalization.java
- jtimegraph
  - src
    - main
      - java
        com
        cognitionis
        jtimegraph
        Main.java
        gregoriangraph
        GregorianGraph.java
        GregorianPoint.java
        timegraph
        Chain.java
        TimeGraph.java
        TimePoint.java
- knowledgek
  - src
    - main
      - java
        com
        cognitionis
        knowledgek
        Main.java
        NUMEK
        NUMEK.java
        TIMEK
        TIMEK.java
        VerbAttributesK.java
- nlp-files
  - src
    - main
      - java
        com
        cognitionis
        nlp_files
        LengthAlphabeticalComparator.java
        Main.java
        NLPFile.java
        NgramHandler.java
        PhraselistFile.java
        PipesFile.java
        PlainFile.java
        RegexPhraselistFile.java
        Stat.java
        TabFile.java
        TempEvalFiles.java
        TokenizedFile.java
        TokenizedPerSentenceFile.java
        TransduceRulelistFile.java
        TreebankFile.java
        XMLFile.java
        annotation_scorers
        Judgement.java
        Scomp.java
        Score.java
        Scorer.java
        parentical_parsers
        SRLColParser.java
        SyntColParser.java
        SyntColSBarTMPRoleParser.java
- nlp-knowledge
  - src
    - main
      - java
        com
        cognitionis
        nlp_knowledge
        Main.java
        numbers
        Numek.java
        time
        Timek.java
        TimexNormalizer.java
        TimexResolver.java
    - test
      - java
        com
        cognitionis
        nlp_knowledge
        numbers
        NumekTest.java
        time
        TimekTest.java
        TimexNormalizerTest.java
- nlp-lang-models
  - src
    - main
      - java
        com
        cognitionis
        nlp_lang_models
        Main.java
        TextCategorizer.java
        TextCategorizerFingerprint.java
    - test
      - java
        com
        cognitionis
        nlp_lang_models
        TextCategorizerTest.java
- nlp-segmentation
  - src
    - main
      - java
        com
        cognitionis
        nlp_segmentation
        Aligner.java
        Main.java
        SentSplit.java
        Tokenizer_PTB_Rulebased.java
    - test
      - java
        com
        cognitionis
        nlp_segmentation
        TokenizerTest.java
- nlp-taggers
  - src
    - main
      - java
        com
        cognitionis
        nlp_taggers
        Baseline_MostFrequentTag.java
        HMM.java
        Main.java
        Tagger.java
- nlpbt
  - src
    - main
      - java
        com
        cognitionis
        nlpbt
        Main.java
- timeml-basickit
  - src
    - main
      - java
        com
        cognitionis
        timeml_basickit
        Element.java
        Event.java
        Link.java
        Main.java
        TML_file_utils.java
        TimeML.java
        TimeReference.java
        Timex.java
        comparators
        AscINT_eiid_Comparator.java
        AscINT_lid_Comparator.java
        AscStringTimeRefMapComparator.java
        AscStringTimexMapComparator.java
- utils-basickit
  - src
    - main
      - java
        com
        cognitionis
        utils_basickit
        AscStringIntMapComparator.java
        DateUtils.java
        DescStringIntMapComparator.java
        DescStringIntMapEntryListComparator.java
        FileUtils.java
        Main.java
        MapUtils.java
        SAXReader.java
        StringUtils.java
        Xml2PlainHandler.java
        XmlAttribs.java
        statistics
        T_test.java
- wiki-basickit
  - src
    - main
      - java
        com
        cognitionis
        wiki_basickit
        DBpedia_bk.java
        Main.java
        WikiHtml2PlainESHandler.java
        WikiHtml2PlainHandler.java
        Wiki_bk.java
package com.cognitionis.nlp_files;

import com.cognitionis.utils_basickit.StringUtils;
import java.io.*;
import java.nio.file.*;
import static java.nio.file.StandardCopyOption.*;
import java.util.*;

/**
 * TokenizedFile consists of a first column of tokens (index) followed by an
 * optional number of additional fields (numfields columns in total), separated
 * by field_separator_re. Examples: tokenization, annotation, features, ... The
 * fields in these files are commonly separated by space, tab or pipe (|).
 *
 * @author Héctor Llorens
 * @since 2011
 */
public class TokenizedFile extends NLPFile  {
    
    private int numfields;
    private String field_separator_re;
    private String field_separator_out;

    /**
     * Creates a tokenized file using the regex {@code \s+} as field separator
     * and 0 as the number of fields.
     *
     * @param filename forwarded unchanged to the three-argument constructor
     */
    public TokenizedFile(String filename) {
        this(filename, "\\s+", 0);
    }

    /**
     * Creates a tokenized file with the given field separator and 0 as the
     * number of fields.
     *
     * @param filename forwarded unchanged to the three-argument constructor
     * @param separator regular expression used to split each line into fields
     */
    public TokenizedFile(String filename, String separator) {
        this(filename, separator, 0);
    }

    /**
     * Creates a tokenized file and immediately runs the format check.
     *
     * @param filename forwarded to the superclass constructor
     * @param separator regular expression used to split each line into fields
     * @param n expected number of fields per line; when 0, the format check
     *          takes the count from the first non-empty line of the file
     */
    public TokenizedFile(String filename, String separator, int n) {
        super(filename);
        this.isWellFormatted = false;
        this.numfields = n;
        this.field_separator_re = separator;
        // Derives the separator used when writing from the regex just stored.
        this.field_separator_out = this.obtainFieldSeparatorOut();
        // Mandatory step: validates the file and, if needed, discovers numfields.
        this.isWellFormatted();
    }

    /**
     * Checks that every non-empty line of the file splits into the same number
     * of fields using {@code field_separator_re}.
     *
     * If {@code numfields} is 0 when the check starts, it is set from the first
     * non-empty line that is read. The result is also stored in the
     * {@code isWellFormatted} field.
     *
     * On any error a message is written to standard error; if the system
     * property {@code DEBUG} is "true" (case-insensitive) the stack trace is
     * printed and the JVM exits with status 1.
     *
     * @return true if the file could be read, is not empty and all non-empty
     *         lines have {@code numfields} fields; false otherwise
     */
    @Override
    public Boolean isWellFormatted() {
        try {
            if (super.getFile() == null) {
                throw new Exception("No file loaded in NLPFile object");
            }
            try (BufferedReader reader = new BufferedReader(new FileReader(this.f))) {
                String line;
                int linen = 0;
                while ((line = reader.readLine()) != null) {
                    line = line.trim();
                    linen++;
                    if (line.length() == 0) {
                        continue; // blank lines carry no fields
                    }
                    // Split once and reuse the count for both discovery and validation.
                    int found = line.split(field_separator_re).length;
                    if (numfields == 0) {
                        numfields = found;
                    }
                    if (numfields != found) {
                        // Report the count obtained with the configured separator
                        // (previously this was recomputed with a hard-coded pipe).
                        throw new Exception("Line " + linen + " (" + line + "): Different number of fields. Expected: " + numfields + "  -  Found:" + found);
                    }
                }
                if (numfields == 0) {
                    throw new Exception("Empty file: " + this.f);
                }
            }
        } catch (Exception e) {
            if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) {
                e.printStackTrace(System.err);
                System.exit(1);
            } else {
                System.err.println("Errors found (" + this.getClass().getSimpleName() + "):\n\t" + e.toString() + "\n");
            }
            this.isWellFormatted = false;
            return false;
        }
        this.isWellFormatted = true;
        return true;
    }


    /**
     * Not implemented: always throws.
     *
     * @param filename unused
     * @return never returns normally
     * @throws UnsupportedOperationException always
     */
    @Override
    public String toPlain(String filename) {
        final String notImplemented = "This will consist of transform tokens to plain text if possible (will look for a column with offset or spacing info). Generated by an alignment";
        throw new UnsupportedOperationException(notImplemented);
    }

    
    /**
     * @return the zero-based index of the last column, i.e. {@code numfields - 1}
     */
    public int getLastDescColumn() {
        final int lastColumn = this.numfields - 1;
        return lastColumn;
    }

    /**
     * @return the number of fields per line currently stored in this object
     */
    public int getNumFields() {
        return this.numfields;
    }

    /**
     * @return the regular expression used to split lines into fields
     */
    public String getFieldSeparatorRE() {
        return this.field_separator_re;
    }

    /**
     * @return the separator string stored for writing fields out
     */
    public String getFieldSeparatorOut() {
        return this.field_separator_out;
    }

    /**
     * Recomputes and stores the output separator from the input separator regex.
     *
     * If the regex text contains a literal {@code \s}, a {@code +} or a
     * {@code *}, a single space is used; otherwise the regex text itself is
     * used as the output separator.
     *
     * @return the newly stored output separator
     */
    public String obtainFieldSeparatorOut() {
        // Start from the single-space default, then override for plain separators.
        this.field_separator_out = " ";
        final boolean hasRegexSyntax = this.field_separator_re.matches("^.*(\\\\s|[+*]).*$");
        if (!hasRegexSyntax) {
            this.field_separator_out = this.field_separator_re;
        }
        return this.field_separator_out;
    }

    /**
     * Counts the values found in the first column (index 0).
     *
     * @return map from each distinct first-column value to its number of occurrences
     */
    public HashMap<String,Integer> getTokenCount(){
        final int firstColumn = 0;
        return this.getTokenCount(firstColumn);
    }
    
    
    /**
     * Counts how many times each distinct value occurs in the given column.
     *
     * Blank lines are skipped. Errors (invalid column, unreadable or empty
     * file) are reported on standard error and whatever has been counted so far
     * is returned; if the system property {@code DEBUG} is "true"
     * (case-insensitive) the stack trace is printed and the JVM exits with
     * status 1.
     *
     * @param field zero-based column index; valid values are
     *              {@code 0 .. numfields - 1}
     * @return map from column value to number of occurrences (possibly empty)
     */
    public HashMap<String,Integer> getTokenCount(int field){
        HashMap<String,Integer> token_count=new HashMap<>();
        try {
            // Column indexes are zero-based, so field == numfields is already out of range.
            // When numfields is still 0 the format check never succeeded; skip the range
            // test so the file-level error (missing or empty file) is the one reported below.
            if (field < 0 || (this.numfields > 0 && field >= this.numfields)) {
                throw new Exception("token field (" + field + ") is not a valid column index; number of fields: " + this.numfields);
            }
            try (BufferedReader reader = new BufferedReader(new FileReader(this.f))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    line = line.trim();
                    if (line.length() != 0) {
                        String token = line.split(field_separator_re)[field];
                        Integer current_token_count = token_count.get(token);
                        if (current_token_count != null) {
                            token_count.put(token, current_token_count + 1);
                        } else {
                            token_count.put(token, 1);
                        }
                    }
                }
                if (numfields == 0) {
                    throw new Exception("Empty file: " + this.f);
                }
            }
        } catch (Exception e) {
            if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) {
                e.printStackTrace(System.err);
                System.exit(1);
            } else {
                System.err.println("Errors found (" + this.getClass().getSimpleName() + "):\n\t" + e.toString() + "\n");
            }
        }

        return token_count;
    }
    
    /**
     * Writes a copy of this tokenized file with an added IOB column derived
     * from the tags of an XML version of the same text.
     *
     * The XML file is read character by character. Everything up to and
     * including the tag whose content equals {@code root_tag} (ignoring case)
     * is skipped. After that, the characters of each token (column 0) are
     * paired, ignoring case, with the XML characters; whitespace in the XML is
     * skipped, and tags found on the way update the current label:
     * {@code B-TAG} for the first token after a matching opening tag,
     * {@code I-TAG} for the following tokens and {@code O} outside tags.
     *
     * The output file name is this file's path plus {@code -annotationKey} and
     * optional suffixes built from {@code elements_re}, {@code attribs_re} and
     * {@code mergeattrib}.
     *
     * @param xmlfile path of the XML file to align with
     * @param root_tag content of the tag after which the alignment starts
     * @param elements_re regular expression (case-insensitive) that a tag name
     *        must match to be annotated; {@code null} is treated as {@code .*}
     * @param attribs_re if not {@code null}, opening tags whose attribute
     *        string does not match this (case-insensitive) regular expression
     *        are ignored, and the attribute string is written as an extra column
     * @param mergeattrib if not {@code null}, the value of this attribute is
     *        appended to the tag name as {@code TAG+value}
     * @return the path of the written file, or {@code null} if any error
     *         occurred (the error is reported on standard error; with the
     *         system property {@code DEBUG} set to "true" the stack trace is
     *         printed and the JVM exits with status 1)
     */
    public String add_IOB_from_XML(String xmlfile, String root_tag, String elements_re, String attribs_re, String mergeattrib) {
        // Same rule as obtainFieldSeparatorOut(), computed into a local variable.
        String field_separator_out = " ";
        if (!field_separator_re.matches("^.*(\\\\s|[+*]).*$")) {
            field_separator_out = field_separator_re;
        }
        String outputfile = this.getFile().toString() + "-annotationKey";
        if (elements_re == null) {
            elements_re = ".*";
        }
        if (!elements_re.equals(".*")) {
            outputfile += "-" + elements_re;
        }
        if (attribs_re != null && !attribs_re.equals(".*")) {
            outputfile += "-" + attribs_re.replaceAll("([.]?\\*|[\"=])", "").replace('|', '_');
        }
        if (mergeattrib != null) {
            outputfile += "-" + mergeattrib;
        }

        try {
            if (!this.isWellFormatted) {
                throw new Exception("Malformed file");
            }

            int tokcolumn = 0;
            int last_desc_column = this.getLastDescColumn();
            boolean hasRoot_tag = false;
            char cxml = '\0';
            String line;
            String tag = "", attribs = "-", inTag = "", inAttribs = "-";
            char BIO = 'O';


            try (BufferedReader tokenizedreader = new BufferedReader(new FileReader(this.f)); BufferedReader xmlreader = new BufferedReader(new FileReader(xmlfile)); BufferedWriter outfile = new BufferedWriter(new FileWriter(outputfile));) {
                // find root tag
                // NOTE: read() returns int -1 at end of stream; once narrowed to char it
                // must be compared with (char) -1, a char is never equal to int -1.
                while (true) {
                    if ((cxml = (char) xmlreader.read()) == (char) -1) {
                        // End of file between tags: the root tag is not there.
                        break;
                    }
                    if (cxml == '<') {
                        if ((cxml = (char) xmlreader.read()) == (char) -1) {
                            throw new Exception("Premature end of model file");
                        }
                        do {
                            tag += cxml;
                            if ((cxml = (char) xmlreader.read()) == (char) -1) {
                                throw new Exception("Premature end of model file");
                            }
                        } while (cxml != '>');
                        if (tag.equalsIgnoreCase(root_tag)) {
                            hasRoot_tag = true;
                            break;
                        }
                        tag = "";

                    }
                }

                if (!hasRoot_tag) {
                    throw new Exception("Root tag " + root_tag + " not found");
                }

                tag = "";
                cxml = '\0';

                while ((line = tokenizedreader.readLine()) != null) {
                    line = line.trim();
                    if (line.length() != 0) {
                        String[] linearr = line.split(field_separator_re);
                        if (linearr.length >= 1) {
                            String token = linearr[tokcolumn];
                            boolean interTokenTag = false;
                            boolean findtokenIter = false;
                            boolean delayed_closing = false;
                            char prevxmlchar = 'x';
                            char prevprevxmlchar = 'x';
                            // Pair every token character with the next XML character.
                            // cn is decremented whenever the XML character is consumed
                            // without matching, so the same token character is retried.
                            for (int cn = 0; cn < token.length(); cn++) {
                                char cpipes = token.charAt(cn);
                                prevprevxmlchar = prevxmlchar;
                                prevxmlchar = cxml;
                                if ((cxml = (char) xmlreader.read()) == (char) -1) {
                                    throw new Exception("Premature end of model file");
                                }
                                if (Character.toLowerCase(cpipes) != Character.toLowerCase(cxml)) {
                                    if (cxml == ' ' || cxml == '\n' || cxml == '\r' || cxml == '\t') {
                                        // whitespace in the XML is skipped
                                        cn--;
                                    } else {
                                        // tags handling
                                        if (cxml == '<') {
                                            if (cn != 0) {
                                                // the tag starts in the middle of a token
                                                interTokenTag = true;
                                            }
                                            cn--;
                                            while (((cxml = (char) xmlreader.read()) != (char) -1) && cxml != '>') {
                                                tag += cxml;
                                            }
                                            tag = tag.trim();
                                            // split "NAME attr=..." into name and attribute string
                                            if (tag.indexOf(' ') != -1) {
                                                attribs = tag.substring(tag.indexOf(' ') + 1);
                                                tag = tag.substring(0, tag.indexOf(' '));
                                            }

                                            if (tag.matches("(?i)" + elements_re) && !tag.startsWith("/")) {
                                                // opening tag of an element to annotate
                                                findtokenIter = true;

                                                if (interTokenTag) {
                                                    System.err.println(xmlfile + " Inter-token (" + cn + ") tag consider manual tokenizing: " + token);
                                                }
                                                if (!inTag.equals("")) {
                                                    throw new Exception(xmlfile + " Nested tags (" + tag + "/" + inTag + ") consider manual correction " + token);
                                                }

                                                inTag = tag;
                                                inAttribs = attribs;
                                                tag = "";
                                                attribs = "-";
                                                BIO = 'B';

                                                // attribute filter: forget the tag if its attributes do not match
                                                if (attribs_re != null && !inAttribs.matches("(?i)" + attribs_re)) {
                                                    BIO = 'O';
                                                    inTag = "";
                                                    inAttribs = "-";
                                                    findtokenIter = false;
                                                    interTokenTag = false;
                                                }

                                                // Only merge for a tag that is still active: a tag just
                                                // rejected by the attribute filter has inAttribs reset to "-",
                                                // and looking up the attribute there would throw.
                                                if (mergeattrib != null && !inTag.equals("")) {
                                                    String tmpattrib = inAttribs.substring(inAttribs.indexOf(mergeattrib + "=")).substring(mergeattrib.length() + 1);
                                                    tmpattrib = tmpattrib.replace("\"", "");
                                                    if (tmpattrib.indexOf(' ') != -1) {
                                                        tmpattrib = tmpattrib.substring(0, tmpattrib.indexOf(' '));
                                                    }
                                                    inTag = inTag + "+" + tmpattrib;
                                                }

                                            } else {
                                                interTokenTag = false;
                                            }

                                            // check if closing
                                            if (tag.matches("/.*")) {
                                                String check = inTag;
                                                // strip the "+value" suffix added by mergeattrib
                                                if (mergeattrib != null && inTag.matches(".+\\+.+")) {
                                                    check = inTag.substring(0, inTag.indexOf('+'));
                                                }
                                                if (tag.matches("/" + "(?i)" + check)) {
                                                    if (findtokenIter) {
                                                        // safe for empty tags (events_4_instances and timex3_4_durations)
                                                        if (cn >= 0) {
                                                            // tag closes inside the current token: label this
                                                            // token first and reset the state after writing it
                                                            System.err.println(xmlfile + " Inter Token end of tag (" + inTag + ") cn=" + cn + " " + line);
                                                            delayed_closing = true;
                                                        } else {
                                                            BIO = 'O';
                                                            inTag = "";
                                                            inAttribs = "-";
                                                            findtokenIter = false;
                                                            interTokenTag = false;
                                                        }
                                                    } else {
                                                        BIO = 'O';
                                                        inTag = "";
                                                    }

                                                }
                                            }

                                            // check if end root_tag
                                            if (tag.matches("/" + "(?i)" + root_tag)) {
                                                System.err.println("closing root_tag=" + root_tag);
                                                // do something
                                                // it never reaches this because tok file ends before.
                                            }
                                            tag = "";
                                            attribs = "-";

                                        } else {
                                            // escaped & < > : skip the XML characters up to ';'
                                            // and retry the same token character
                                            if (cxml == '&' || (prevxmlchar == '&' && cxml == 'a') || (prevprevxmlchar == ';' && prevxmlchar == ' ' && cxml == 'a')) {
                                                cn--;
                                                while (((cxml = (char) xmlreader.read()) != (char) -1) && cxml != ';') {
                                                    if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) {
                                                        System.err.println("Reading XML escaped char in: " + token);
                                                    }
                                                }
                                            } else {
                                                throw new Exception(xmlfile + " Distinct chars cxml(" + cxml + ") cpipes(" + cpipes + ")");
                                            }
                                        }

                                    }
                                }
                            }

                            // Write all columns but the last; the label goes right after
                            // column last_desc_column.
                            for (int i = 0; i < linearr.length - 1; i++) {
                                // There are roles columns in the sentence
                                if (i == last_desc_column) {
                                    outfile.write(linearr[i] + field_separator_out + BIO);
                                    if (BIO != 'O') {
                                        outfile.write("-" + inTag);
                                    }
                                    if (attribs_re != null) {
                                        outfile.write(field_separator_out + inAttribs + field_separator_out);
                                    }
                                    if (BIO == 'B') {
                                        // following tokens of the same element are "inside"
                                        BIO = 'I';
                                        inAttribs = "-";
                                    }
                                } else {
                                    outfile.write(linearr[i] + field_separator_out);
                                }
                            }

                            // There arent roles columns in the sentences
                            if (linearr.length - 1 == last_desc_column) {
                                outfile.write(linearr[linearr.length - 1] + field_separator_out + BIO);
                                if (BIO != 'O') {
                                    outfile.write("-" + inTag);
                                }
                                if (attribs_re != null) {
                                    outfile.write(field_separator_out + inAttribs + field_separator_out);
                                }
                                if (BIO == 'B') {
                                    BIO = 'I';
                                    inAttribs = "-";
                                }
                            } else {
                                outfile.write(linearr[linearr.length - 1]);
                            }
                            if (delayed_closing) {
                                BIO = 'O';
                                inTag = "";
                                inAttribs = "-";
                                findtokenIter = false;
                                interTokenTag = false;
                                delayed_closing = false;
                            }
                            outfile.write("\n");
                        } else {
                            if (!inTag.equals("")) {
                                throw new Exception(xmlfile + " Broken tag: " + inTag + " at the end of the file/sentence");
                            }
                            outfile.write(line + "\n");
                        }
                    } else {
                        outfile.write("\n");
                    }
                }

            }



        } catch (Exception e) {
            System.err.println("Errors found (" + this.getClass().getSimpleName() + "):\n\t" + e.toString() + "\n");
            if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) {
                e.printStackTrace(System.err);
                System.exit(1);
            }
            return null;
        }
        return outputfile;
    }

    /* TODO: merge with the next function; also add another one to merge from XML instead of plain text, and add parameters to keep the header/footer and to remove specific elements if needed. */
    public String generateXML(String plainmodel, String id_string) {
        String outputfile;
        int linen = 0;
        try {
            boolean usingNotUTF8tool = false; // make it true if there might be strange chars due to enconding problems
            outputfile = plainmodel + ".xml";
            BufferedWriter outfile = new BufferedWriter(new FileWriter(outputfile));
            BufferedReader pipesreader = new BufferedReader(new FileReader(this.getFile()));
            BufferedReader modelreader = new BufferedReader(new FileReader(plainmodel));

            try {
                int sentence = 0, tokn = 0;
                int tokcolumn = 0;
                String pipesline;
                String[] pipesarr = null;
                char cmodel = '\0';
                char cmodel_prev = '\0';
                int offset = -1;
                boolean readmodel = true;

                /*int token_leading_blanks = 0;int token_leading_tabs = 0;int token_leading_newlines = 0;*/
                String leadingBlanksString = "";

                while ((pipesline = pipesreader.readLine()) != null) {
                    linen++;
                    //System.err.println(pipesline);
                    if (pipesline.trim().length() > 1) {


                        pipesarr = pipesline.split(this.field_separator_re);
                        String token = pipesarr[tokcolumn];
                        int token_offset = -1;
                        String paired_token = "";

                        int cn = 0;
                        while (true) {
                            if (readmodel) {
                                cmodel_prev = cmodel;
                                if ((cmodel = (char) modelreader.read()) == -1) {
                                    if (cn == token.length()) {
                                        break; // save last token end of file
                                    } else {
                                        throw new Exception("Premature end of model file");
                                    }
                                }
                                offset++;
                            } else {
                                readmodel = true;
                            }

                            char cpipes = '\0';
                            if (cn >= token.length()) {
                                if (usingNotUTF8tool) {
                                    if (StringUtils.isISO_8859_1(cmodel)) {
                                        readmodel = false;
                                        break;
                                    } else { // delayed token mode for non-ISO desperate cases
                                        cpipes = 'a';
                                    }
                                } else {
                                    readmodel = false;
                                    break;
                                }
                            } else {
                                cpipes = token.charAt(cn);
                            }


                            //System.out.println("offset=" + offset + " cmodel(" + cmodel + ") cpipes(" + cpipes + ")");
                            if (Character.toLowerCase(cpipes) == Character.toLowerCase(cmodel) || (cmodel == '|' && cpipes == '-')) {
                                if (cmodel == '|') {
                                    paired_token += "|"; //scape char for features
                                } else {
                                    paired_token += cmodel;
                                }
                                if (token_offset == -1) {
                                    token_offset = offset;
                                }

                                // multi-dashes problem ('---' is translated by e.g. Roth to '-')
                                if (usingNotUTF8tool && cmodel == '-' && cn == token.length() - 1) {
                                    // read a new char (cmodel) if not end of file to check multi-dash
                                    if (!((cmodel = (char) modelreader.read()) == -1)) {
                                        readmodel = false;
                                        offset++;
                                        if (cmodel == '-') {
                                            cn--;
                                        }
                                        //if (cmodel == ' ' || cmodel == '\n' || cmodel == '\r' || cmodel == '\t') {
                                        //cn++;
                                        //readmodel = true;
                                        //}
                                    }
                                } else {
                                    readmodel = true;
                                }
                                //readmodel = true;

                            } else {
                                if (cmodel == ' ' || cmodel == '\t' || cmodel == '\n' || cmodel == '\r') {
                                    cn--;
                                    /* DEPRECATED: if ((cmodel == ' ' || cmodel == '\t') && token_offset == -1) {token_leading_blanks++;}if (cmodel == '\n' && token_offset == -1) {token_leading_newlines++;                                        }*/
                                    if (token_offset == -1 && paired_token.equals("")) {
                                        if (cmodel == ' ') {
                                            leadingBlanksString += "s";
                                        } else {
                                            if (cmodel == '\t') {
                                                leadingBlanksString += "t";
                                            } else {
                                                if (cmodel == '\n') {
                                                    leadingBlanksString += "n";
                                                } else {
                                                    if (Character.toLowerCase(cmodel) == '\r') {
                                                        if ((cmodel = (char) modelreader.read()) != (char) -1) {
                                                            offset++;
                                                            if (Character.toLowerCase(cmodel) != '\n') {
                                                                throw new Exception("End of pipesline not found (rn) " + "offset=" + offset + ". cmodel(" + cmodel + ") found instead.");
                                                            } else {
                                                                //DEPRECATED: token_leading_newlines++;
                                                                leadingBlanksString += "n";
                                                            }
                                                        }
                                                    }
                                                }
                                            }
                                        }
                                    } else {
                                        // if (cmodel == ' ') {paired_token += " ";} // No please
                                        throw new Exception("A space, tab, or newline in the middle of the token cannot be paired, use UTF-8 NLP tools.");
                                    }
                                } else {
                                    // Special for quotes (Roth translates " to `` or '')
                                    if (usingNotUTF8tool && (cmodel == '"' && ((cpipes == '`') || (cpipes == '\'')))) {
                                        if (cn + 1 < token.length() && cpipes == token.charAt(cn + 1)) {
                                            cn += 2;
                                            paired_token += cmodel;
                                        }
                                    } else {
                                        // Special for quotes2 (Roth sudenly changes '' by ``)
                                        if (usingNotUTF8tool && ((cmodel == '\'' || cmodel == '`') && (cpipes == '`' || cpipes == '\''))) {
                                            paired_token += cmodel;
                                        } else {
                                            // multi-dashes problem ('---' is translated by e.g. Roth to '-')
                                            if (usingNotUTF8tool && cmodel == '-' && cmodel_prev == '-') {
                                                paired_token += cmodel;
                                                readmodel = true;
                                                cn--;
                                            } else {
                                                // special for ISO NLP tools
                                                if (usingNotUTF8tool && !StringUtils.isISO_8859_1(cmodel)) {
                                                    paired_token += cmodel;
                                                    readmodel = true;
                                                    cn--;
                                                } else {
                                                    throw new Exception("Distinct chars " + paired_token + " offset=" + offset + " cmodel(" + cmodel + ") cpipes(" + cpipes + ")");
                                                }
                                            }
                                        }

                                    }
                                }
                            }
                            cn++;
                        }
                        // DEPRECATED: outfile.write(filename + "|" + sentence + "|" + tokn + "-" + token_leading_blanks + "-" + token_leading_newlines + "|" + paired_token);
                        outfile.write(sentence + "|" + tokn + "-" + leadingBlanksString + "|" + paired_token);
                        for (int i = 1; i < pipesarr.length; i++) {
                            outfile.write("|" + pipesarr[i]);
                        }
                        outfile.write("\n");
                        //DEPRECATED: token_leading_blanks = 0;  token_leading_newlines = 0;
                        leadingBlanksString = "";
                        tokn++;
                    } else { // newline new sentence
                        // DEPRECATED: outfile.write(pipesline + "\n"); // ommit this because of sentences
                        tokn = 0;
                        sentence++;
                    }
                }
            } finally {
                pipesreader.close();
                modelreader.close();
                outfile.close();
            }

        } catch (Exception e) {
            System.err.println("Errors found (TIMEE):\n\t" + e.toString() + "- line:" + linen + "\n");
            if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) {
                e.printStackTrace(System.err);
                System.exit(1);
            }
            return null;
        }
        return outputfile;
    }

    // TODO
    /* TODO MERGE THIS FUNCTION WITH THE PREVIOUS ONE*/
    /**
     * Converts this pipes file into an XML document where each IOB2 chunk is
     * wrapped in an element named after its (upper-cased) tag.
     *
     * <p>Each input line is split on {@code |}. Column 0 is compared against the
     * current file id, column 2 may carry a {@code <token>-<blanks>} spec whose
     * blanks part encodes leading whitespace ({@code s} = space, {@code t} = tab,
     * {@code n} = newline), and the column returned by
     * {@code getLastDescColumn()} holds the IOB2 tag ({@code B-X}, {@code I-X}
     * or {@code O}).
     *
     * <p>NOTE(review): the word is read from column 0 ({@code wordcol}), which is
     * the same column used as the file id; this looks inconsistent and should be
     * confirmed against the intended pipes layout.
     *
     * <p>NOTE(review): the output path is always this file's absolute path plus
     * {@code .xml}, so a change of file id re-opens (and truncates) the same file.
     *
     * @param id_string name of the id attribute written on every element;
     *                  {@code null} means {@code "id"}; the special value
     *                  {@code "1st_letter"} builds the name from the lower-cased
     *                  first letter of the element name followed by {@code "id"}
     * @return the path of the written XML file, or {@code null} if any error
     *         occurred (the error is reported on {@code System.err}; with the
     *         {@code DEBUG} system property set to {@code true} the JVM exits)
     */
    public String pipes2tml(String id_string) {
        if (id_string == null) {
            id_string = "id";
        }
        String last_text_blanks = ""; // TODO: find out how to do this
        String outputfile = null;
        BufferedWriter outfile = null;
        int linen = 0;
        int fake_id = 1;

        try {
            int iob2col = this.getLastDescColumn();
            int wordcol = 0;
            String inElem = "";
            String curr_fileid = "";

            try (BufferedReader pipesreader = new BufferedReader(new FileReader(this.getFile()))) {
                String pipesline;

                while ((pipesline = pipesreader.readLine()) != null) {
                    linen++;
                    String[] pipesarr = pipesline.split("\\|");

                    if (!curr_fileid.equals(pipesarr[0])) {
                        if (outfile != null && !curr_fileid.equals("")) {
                            // Close a still-open element so the finished document stays well formed.
                            if (!inElem.equals("")) {
                                outfile.write("</" + inElem + ">");
                                inElem = "";
                            }
                            outfile.write(last_text_blanks + "</TEXT>\n\n");

                            outfile.close();
                            outfile = null;
                            fake_id = 1;
                        }

                        outputfile = this.getFile().getAbsolutePath() + ".xml";
                        outfile = new BufferedWriter(new FileWriter(outputfile));

                        curr_fileid = pipesarr[0]; // initialize file
                        outfile.write("<?xml version=\"1.0\" ?>");
                        outfile.write("<TEXT>\n");
                    }

                    String preceding_blanks = tmlBlanksFromSpec(pipesarr[2]);
                    String iobTag = pipesarr[iob2col];

                    if (iobTag.startsWith("B")) {
                        if (!inElem.equals("")) {
                            outfile.write("</" + inElem + ">");
                            inElem = "";
                        }
                        inElem = iobTag.substring(2).toUpperCase();
                        outfile.write(preceding_blanks + "<" + inElem);
                        // fake attribs/id
                        String tmp_id_string = id_string;
                        // equals(), not ==: the caller's string is not guaranteed to be interned.
                        if ("1st_letter".equals(id_string)) {
                            tmp_id_string = inElem.substring(0, 1).toLowerCase() + "id";
                        }
                        outfile.write(" " + tmp_id_string + "=\"" + fake_id + "\" ");
                        fake_id++;
                        // The start tag is only closed here; writing ">" for every
                        // token would leave stray '>' characters in the text.
                        outfile.write(">");
                    }
                    if (iobTag.startsWith("I")) {
                        if (inElem.equals("")) {
                            throw new Exception("Found I-X element without B-X (" + iobTag + ")");
                        }
                    }
                    if (iobTag.equals("O")) {
                        if (!inElem.equals("")) {
                            outfile.write("</" + inElem + ">");
                            inElem = "";
                        }
                    }
                    if (!iobTag.startsWith("B")) {
                        // For B tokens the blanks were already written before the start tag.
                        outfile.write(preceding_blanks);
                    }
                    outfile.write(tmlEscapeText(pipesarr[wordcol]));
                }

                if (outfile == null) {
                    // Nothing was read: report it explicitly instead of failing with a NullPointerException.
                    throw new Exception("Empty pipes file, no XML output generated");
                }

                if (!inElem.equals("")) {
                    outfile.write("</" + inElem + ">");
                    inElem = "";
                }

                outfile.write(last_text_blanks + "</TEXT>\n\n");

            } finally {
                if (outfile != null) {
                    outfile.close();
                }
            }

        } catch (Exception e) {
            System.err.println("Errors found (TempEval):\n\t" + e.toString() + " (Reading line " + linen + ")\n");
            if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) {
                e.printStackTrace(System.err);
                System.exit(1);
            }
            return null;
        }
        return outputfile;
    }

    /**
     * Decodes the leading-whitespace part of a {@code <token>-<blanks>} spec:
     * {@code s} becomes a space, {@code t} a tab and {@code n} a newline; any
     * other character is ignored. Returns an empty string when the spec has no
     * blanks part.
     */
    private static String tmlBlanksFromSpec(String spec) {
        StringBuilder blanks = new StringBuilder();
        if (spec.matches(".*-[stn]*")) {
            String[] blanksarr = spec.split("-");
            if (blanksarr.length > 1) {
                String codes = blanksarr[1];
                for (int i = 0; i < codes.length(); i++) {
                    switch (codes.charAt(i)) {
                        case 's':
                            blanks.append(' ');
                            break;
                        case 'n':
                            blanks.append('\n');
                            break;
                        case 't':
                            blanks.append('\t');
                            break;
                        default:
                            break;
                    }
                }
            }
        }
        return blanks.toString();
    }

    /**
     * Escapes the XML markup characters {@code &}, {@code <} and {@code >} in a
     * token. {@code &} is handled first so the entities produced for the other
     * two characters are not escaped again.
     *
     * <p>NOTE(review): the previous code also ran {@code replaceAll("|", "|")},
     * whose pattern is an empty alternation that matches at every position and
     * therefore inserted a pipe between all characters. Whatever pipe escape it
     * was meant to decode cannot be recovered from this file, so no pipe
     * translation is done here - confirm against the code that writes the
     * pipes files.
     */
    private static String tmlEscapeText(String text) {
        return text.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;");
    }

    /**
     * Runs the IOB2 correction on the column reported by
     * {@code getLastDescColumn()}; see {@code correctIOB(int)}.
     */
    public void correctIOB() {
        final int lastDescColumn = this.getLastDescColumn();
        this.correctIOB(lastDescColumn);
    }

    /**
     * Repairs IOB2 tags in this file so that a chunk never starts with
     * {@code I-}: an {@code I-X} tag that follows an {@code O} tag (or a line
     * with a single field, or the start of the file) is rewritten to
     * {@code B-X}. All other lines are copied unchanged.
     *
     * <p>The corrected content is written to a temporary sibling file
     * ({@code <canonical path>-IOB2checked}) and copied over this file only when
     * the whole input was processed without error, so a failure part-way through
     * no longer replaces the original with a truncated result. The temporary
     * file is removed in every case.
     *
     * <p>Errors are reported on {@code System.err}; with the {@code DEBUG}
     * system property set to {@code true} the stack trace is printed and the
     * JVM exits.
     *
     * @param IOB2column zero-based index of the column holding the IOB2 tag
     */
    public void correctIOB(int IOB2column) {
        try {
            if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) {
                System.err.println("Correcting IOB...");
            }
            String outputfile = this.getFile().getCanonicalPath() + "-IOB2checked";
            File out = new File(outputfile);

            try {
                try (BufferedReader tokenizedreader = new BufferedReader(new FileReader(this.getFile())); BufferedWriter outfile = new BufferedWriter(new FileWriter(outputfile))) {
                    String pipesline;
                    String previousIOB2 = "O";
                    while ((pipesline = tokenizedreader.readLine()) != null) {
                        String[] pipesarr = pipesline.split(this.field_separator_re);
                        // CORRECT IF NEEDED
                        if (pipesarr.length > 1 && previousIOB2.equals("O") && pipesarr[IOB2column].startsWith("I-")) {
                            System.err.println("IOB Corrected: I -> B");
                            // The "I-" prefix is known to be present, so swap it directly.
                            pipesarr[IOB2column] = "B-" + pipesarr[IOB2column].substring(2);
                            StringBuilder corrected = new StringBuilder();
                            for (int i = 0; i < pipesarr.length; i++) {
                                if (i > 0) {
                                    corrected.append(this.field_separator_out);
                                }
                                corrected.append(pipesarr[i]);
                            }
                            outfile.write(corrected.toString());
                            outfile.write("\n");
                            previousIOB2 = "B";
                        } else {
                            outfile.write(pipesline + "\n");
                            if (pipesarr.length > 1) {
                                previousIOB2 = pipesarr[IOB2column].substring(0, 1);
                            } else {
                                // A line with a single field resets the chunk state.
                                previousIOB2 = "O";
                            }
                        }
                    }
                }
                // Reached only when reading and writing completed (streams are closed and
                // flushed here), so the original is never overwritten by partial output.
                Files.copy(Paths.get(out.toURI()), Paths.get(this.getFile().toURI()), REPLACE_EXISTING);
            } finally {
                out.delete();
            }
        } catch (Exception e) {
            System.err.println("Errors found (TempEval):\n\t" + e.toString() + "\n");
            if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) {
                e.printStackTrace(System.err);
                System.exit(1);
            }
        }
    }
}