package com.cognitionis.nlp_segmentation;

import com.cognitionis.utils_basickit.FileUtils;

import java.io.*;
import java.util.regex.Pattern;

/**
 * Tokenizer_PTB_Rulebased
 *
 * STRATEGY (TreeBank format):
 *   1) Separate what is always a token
 *   2) Separate what is always a token when followed by space
 *   3) Separate periods except acronyms and abbreviations
 *
 * GOOD EXTRAS:
 *   1) Complement this with an alignment tool to get back the original text
 *      (either with space, offset handling or both)
 *
 * KNOWN LIMITATIONS:
 *   1) Period ambiguity is not handled correctly (however this is a known
 *      problem often handled at the POS-tagging step)
 *
 * @author Hector Llorens
 * @since May 20, 2013
 */
public class Tokenizer_PTB_Rulebased {

    // If true, emit a blank line after each sentence-final ./!/? token
    // (simple regex sentence splitting).
    private Boolean doSentSplit;

    // Built-in abbreviation alternatives (mixed English/Spanish/Catalan),
    // stored WITHOUT their trailing period. The field name keeps its
    // historical typo ("defauld") because it is public API.
    public static final String defauldAbbrevRegex = "(adj|Adm|adv|Ala|apdo|Apdo|Ariz|Ark|Aug|Ave|Bancorp|Bhd|Brig|Bros|Ca|Calif|Capt|Cia|Cía|Cie|Co|CO|Col|Colo|Conn|Corp|CORP|Cos|COS|Dec|Del|dept|Dept|D-Mass|Dr|dr|Drs|ej|etc|Etc|fdo|Feb|Fla|ft|Ft|Ga|Gen|Gob|Gov|Hon|Ill|Inc|INC|Ind|Jan|Jr|Kan|Ky|La|Lt|Ltd|Maj|Mass|Md|Messrs|Mfg|Mich|Minn|Miss|Mo|Mr|Mrs|Ms|Neb|Nev|No|Nos|Nov|num|Num|núm|Oct|Okla|Ont|Ore|Pa|pág|págs|Ph|Prof|Prop|Pty|Rep|Reps|Rev|Sen|Sens|Sept|Sgt|Sr|Sra|Srta|St|Ste|tel|Tel|telef|Telef|Tenn|Tex|Ud|Uds|Va|Va|Vd|Vds|vol|vs|Vs|VS|Vt|Wash|Wis|Wyo)";

    // Whitespace runs plus (MULTILINE) line starts/ends -> normalized to a
    // single space before and after each token pass.
    public static Pattern basicSpacePattern = Pattern.compile("^|$|\\s+", Pattern.MULTILINE);

    // Character runs that are ALWAYS separated as tokens (grouped if equal):
    // brackets, sentence punctuation, ellipses, dashes, quote marks.
    public static Pattern alwaysTokenPattern = Pattern.compile("(\\[+|\\(+|\\{+|\\<+|\\]+|\\)+|\\}+|\\>+|[;]+|[?!]+|[¿¡]+|=+|\\.\\.+|--+|\"|`+|''+)");

    // Only a token when followed by space ((?=...) avoids re-replacing the
    // space afterwards); a leading "-" is not split off numbers (e.g. -5).
    public static Pattern bySpacePattern = Pattern.compile("([-:,]\\s|\\s-(?![\\d,.]))");

    // Single-quote handling: \s' and '\s, except decade forms like '90s and
    // '' (already separated by alwaysTokenPattern).
    public static Pattern singleQuotePattern = Pattern.compile("('\\d+0s?|(?<=\\s)'(?!')|(?<!')'(?=\\s))");

    // Clitic prefixes (l', d', n', ...). Language dependent; a single GENERIC
    // set covers English and Catalan. (?![t]\b) avoids separating n't, which
    // is safe since that sequence never occurs in Catalan.
    public static Pattern oneWordPrefixPattern = Pattern.compile("\\b([ldnmts]')(?=[a-zA-ZÀ-ÿ0-9])(?![t]\\b)", Pattern.CASE_INSENSITIVE);

    // Clitic/contraction suffixes ('ll, n't, 's, Catalan -la/-ho/...).
    public static Pattern oneWordSuffixPattern = Pattern.compile("('ll|'re|'ve|n't|'[smdnl]|-(?:la|li|lo|ho|hi|me|te|se))\\b", Pattern.CASE_INSENSITIVE);

    // PTB two-word contractions, split into two tokens (cannot -> can not).
    public static final String[] twoWordContractions = new String[]{"\\b(can)(not)\\b", "\\b(d')(ye)\\b", "\\b(gim)(me)\\b", "\\b(gon)(na)\\b", "\\b(got)(ta)\\b", "\\b(lem)(me)\\b", "\\b(more)('n)\\b", "\\b(wan)(na)\\b"};
    public static final Pattern[] twoWordContractionPatterns = new Pattern[twoWordContractions.length];

    static {
        for (int i = 0; i < twoWordContractions.length; i++) {
            twoWordContractionPatterns[i] = Pattern.compile(twoWordContractions[i], Pattern.CASE_INSENSITIVE);
        }
    }

    // PTB three-word contractions (whaddya -> wha dd ya).
    public static final String[] threeWordContractionRegexes = new String[]{"\\b(wha)(dd)(ya)\\b", "\\b(wha)(t)(cha)\\b"};
    public static final Pattern[] threeWordContractionsPatterns = new Pattern[threeWordContractionRegexes.length];

    static {
        for (int i = 0; i < threeWordContractionRegexes.length; i++) {
            threeWordContractionsPatterns[i] = Pattern.compile(threeWordContractionRegexes[i], Pattern.CASE_INSENSITIVE);
        }
    }

    //public static Pattern tAbbreviationPattern = Pattern.compile("('t)(is|was)\\b"); // too slang to be supported

    // Ordered token-splitting passes applied by tokenize()/getTokenTexts().
    protected Pattern[] patterns;
    // Abbreviation alternation built from a user-supplied file (File ctor only).
    private String abbrevRegex;
    // Splits sentence-final periods while skipping abbreviations/acronyms.
    public Pattern periodPattern;

    /**
     * Creates a tokenizer with sentence splitting enabled.
     */
    public Tokenizer_PTB_Rulebased() {
        this(true);
    }

    /**
     * Creates a tokenizer using the built-in abbreviation list.
     *
     * @param sentsplit whether to perform simple sentence splitting
     */
    public Tokenizer_PTB_Rulebased(Boolean sentsplit) {
        doSentSplit = sentsplit;
        // Keep a period attached when preceded by a digit or a short
        // (1-3 letter) word with an apostrophe; otherwise split it when it is
        // followed by whitespace and NOT preceded by a single letter (acronym
        // initial), an abbreviation, or an ellipsis.
        String periodRegex = "((?<=(\\d|[a-zA-ZÀ-ÿ]['][a-zA-ZÀ-ÿ][a-zA-ZÀ-ÿ]?))\\.(?=[\\s])|(?<!(\\b[a-zA-ZÀ-ÿ]|" + defauldAbbrevRegex + "|\\.\\.))\\.(?=[\\s]))";
        periodPattern = Pattern.compile(periodRegex, Pattern.MULTILINE);
        patterns = new Pattern[]{bySpacePattern, singleQuotePattern, oneWordSuffixPattern, alwaysTokenPattern, oneWordPrefixPattern};
    }

    /**
     * Creates a tokenizer reading the abbreviation list from a file: one
     * abbreviation per line; blank lines and lines starting with "#" or "//"
     * are ignored; trailing periods are stripped.
     *
     * @param sentsplit whether to perform simple sentence splitting
     * @param abbrev    plain-text abbreviation file
     */
    public Tokenizer_PTB_Rulebased(Boolean sentsplit, File abbrev) {
        doSentSplit = sentsplit;
        abbrevRegex = "";
        try {
            BufferedReader reader = new BufferedReader(new FileReader(abbrev));
            try {
                String line;
                int i = 0;
                while ((line = reader.readLine()) != null) {
                    line = line.trim();
                    if (line.equals("") || line.startsWith("#") || line.startsWith("//")) {
                        continue; // skip blanks and comment lines
                    }
                    if (i == 0) {
                        abbrevRegex = "(" + line;
                    } else {
                        abbrevRegex += "|" + line;
                    }
                    i++;
                }
                if (!abbrevRegex.equals("")) {
                    // Abbreviations are matched without their trailing period.
                    abbrevRegex = abbrevRegex.replaceAll("\\.", "") + ")";
                } else {
                    // FIX: an empty abbreviation file used to leave
                    // abbrevRegex == "", which injected an empty alternative
                    // into the period lookbehind below; the empty alternative
                    // matches everywhere, so NO sentence-final period was
                    // ever tokenized. Fall back to the built-in list.
                    abbrevRegex = defauldAbbrevRegex;
                }
            } finally {
                reader.close();
            }
        } catch (Exception e) {
            System.err.println("Errors found (FileUtils):\n\t" + e.toString() + ":" + e.getMessage() + "\n");
            if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) {
                e.printStackTrace(System.err);
            }
            // NOTE(review): exiting the JVM from a constructor is drastic;
            // kept as-is for backward compatibility with CLI callers.
            System.exit(1);
        }
        String periodRegex = "((?<=(\\d|[a-zA-ZÀ-ÿ]['][a-zA-ZÀ-ÿ][a-zA-ZÀ-ÿ]?))\\.(?=[\\s])|(?<!(\\b[a-zA-ZÀ-ÿ]|" + abbrevRegex + "|\\.\\.))\\.(?=[\\s]))";
        periodPattern = Pattern.compile(periodRegex, Pattern.MULTILINE);
        // NOTE(review): pass order differs from the (Boolean) constructor
        // (alwaysTokenPattern runs earlier here); preserved to avoid changing
        // existing output.
        patterns = new Pattern[]{bySpacePattern, alwaysTokenPattern, singleQuotePattern, oneWordSuffixPattern, oneWordPrefixPattern};
    }

    /**
     * Tokenizes the input text and returns the tokens as a string, one token
     * per line. With sentence splitting enabled, a blank line follows each
     * sentence-final ./!/? token.
     *
     * @param text plain text (null or empty yields "")
     * @return newline-separated tokens
     */
    public String tokenize(String text) {
        String tokens = "";
        if (text != null && text.length() != 0) {
            text = basicSpacePattern.matcher(text).replaceAll(" ");
            for (Pattern pattern : patterns) {
                text = pattern.matcher(text).replaceAll(" $1 ");
            }
            for (Pattern pattern : twoWordContractionPatterns) {
                text = pattern.matcher(text).replaceAll(" $1 $2");
            }
            for (Pattern pattern : threeWordContractionPatterns()) {
                text = pattern.matcher(text).replaceAll(" $1 $2 $3");
            }
            text = periodPattern.matcher(text).replaceAll(" . ");
            // Again, to collapse any double spaces the passes introduced.
            text = basicSpacePattern.matcher(text).replaceAll(" ");
            text = text.trim();
            if (!text.isEmpty()) {
                tokens = text.replaceAll("\\s+", "\n");
                if (doSentSplit) {
                    tokens += "\n";
                    tokens = tokens.replaceAll("\n([.!?])\n", "\n$1\n\n");
                }
            }
        }
        return tokens;
    }

    // Small accessor so the pipeline above reads uniformly; returns the
    // precompiled three-word contraction patterns.
    private static Pattern[] threeWordContractionPatterns() {
        return threeWordContractionsPatterns;
    }

    /**
     * Tokenizes an input file and writes the tokens to out_file, or to stdout
     * when out_file is null. Input must be ASCII or UTF-8.
     *
     * @param in_file  input text file
     * @param out_file output file, or null for stdout
     * @throws Exception if the input encoding is not ASCII/UTF-8 or on I/O errors
     */
    public void tokenize(File in_file, File out_file) throws Exception {
        String encoding = FileUtils.getEncoding(in_file);
        if (!encoding.equalsIgnoreCase("UTF-8") && !encoding.equalsIgnoreCase("ASCII")) {
            throw new Exception("\n\tError: Only ASCII/UTF-8 text is allowed. " + in_file.getName() + " is " + encoding + "\n");
        }
        BufferedWriter out;
        if (out_file != null) {
            // FIX: write UTF-8 explicitly. FileWriter used the platform
            // default charset, corrupting non-ASCII tokens on non-UTF-8
            // systems even though input is always read as UTF-8.
            out = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(out_file), "UTF-8"));
        } else {
            out = new BufferedWriter(new OutputStreamWriter(System.out));
        }
        BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(in_file), "UTF-8"));
        try {
            String line;
            // Lag one line behind so the final line can be special-cased:
            // this avoids an extra blank line when the file already ends
            // with a sentence separator.
            String previous_line = null;
            while ((line = in.readLine()) != null) {
                if (previous_line != null) {
                    out.write(tokenize(previous_line));
                }
                previous_line = line;
            }
            String last_line = tokenize(previous_line); // tokenize(null) == ""
            if (doSentSplit && last_line.endsWith("\n\n")) {
                last_line = last_line.substring(0, last_line.length() - 1);
            }
            out.write(last_line);
        } finally {
            if (out_file == null) {
                out.flush(); // never close stdout
            } else {
                out.close();
            }
            in.close();
        }
    }

    /**
     * Tokenizes the input text and returns the tokens as a string array.
     *
     * @param text plain text
     * @return token array, or null when the input reduces to nothing
     * @throws FileNotFoundException never thrown; kept for signature compatibility
     */
    public String[] getTokenTexts(String text) throws FileNotFoundException {
        text = basicSpacePattern.matcher(text).replaceAll(" ");
        for (Pattern pattern : patterns) {
            text = pattern.matcher(text).replaceAll(" $1 ");
        }
        for (Pattern pattern : twoWordContractionPatterns) {
            text = pattern.matcher(text).replaceAll(" $1 $2");
        }
        for (Pattern pattern : threeWordContractionsPatterns) {
            text = pattern.matcher(text).replaceAll(" $1 $2 $3");
        }
        text = periodPattern.matcher(text).replaceAll(" . ");
        // Again, to collapse any double spaces the passes introduced. After
        // this pass the text contains no newlines, so trim() is equivalent to
        // the previous per-call MULTILINE ^\s+|\s+$ pattern (now removed).
        text = basicSpacePattern.matcher(text).replaceAll(" ");
        text = text.trim();
        String[] tokens = null;
        if (!text.isEmpty()) {
            tokens = text.split("\\s+");
        }
        return tokens;
    }

    /**
     * Tokenizes an input filename, writing tokens to in_filename + ".tok".
     *
     * @param in_filename input file path
     * @return the output file path (in_filename + ".tok")
     * @throws Exception if the input encoding is not ASCII/UTF-8 or on I/O errors
     */
    public String tokenize_filename_to_tokfile(String in_filename) throws Exception {
        // FIX: this method previously duplicated the whole body of
        // tokenize(File, File) verbatim (including a dead null check on a
        // just-constructed File); delegate instead.
        File in_file = new File(in_filename);
        File out_file = new File(in_filename + ".tok");
        tokenize(in_file, out_file);
        return in_filename + ".tok";
    }
}