package com.cognitionis.nlp_files;

import java.io.*;
import java.net.JarURLConnection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Locale;
import java.util.Map.Entry;
import java.util.Set;
import java.util.TreeMap;
import java.util.regex.Pattern;

/**
 * PhraselistFile consists of instances like phrase (one or more words)
 * optionally followed by {@code |canonical_form}.
 * Examples: {@code one|1}, {@code address book|contact_list},
 * {@code Monday|TWeekday}, {@code Lunes|Monday}.
 *
 * <p>IMPORTANT: Order matters! LONGER PHRASES MUST APPEAR FIRST. The regex for
 * the phrases is built in order (FIFO); if the phraselist contains a shorter
 * phrase before a longer phrase that contains it, the longer one would never
 * be matched.
 *
 * <p>NOTE: when a canonical form is present, a phrase can itself contain
 * {@code |}, e.g. {@code (a|b)|c} — only the LAST {@code |} on the line
 * separates phrase from canonical form. This saves lines, but for inverse
 * mapping one phrase per line may be preferable; functions could condense or
 * expand this (as grammars do).
 *
 * @author Héctor Llorens
 * @since 2011
 */
public class PhraselistFile extends NLPFile {

    /** Phraselist name, derived from the filename: "c_" + basename (lowercase). */
    private String name;
    /** Whether the file uses canonical forms; decided by the first non-empty line, null until read. */
    private Boolean has_canonical;
    private Boolean case_sensitive;
    private Boolean require_canonical;
    private Boolean allow_regex;
    /** If true, multi-word phrases are kept in the main map/regex instead of the multitoken ones. */
    private Boolean unify_multitokens;
    // If some other type is needed it can be transformed at run-time
    // (dynamic casting is complicated and makes things complicated).
    private HashMap<String, String> map;                // single-token phrase -> canonical form
    private HashMap<String, String> multitoken_map;     // multi-token phrase -> canonical form
    private HashSet<String> keyset;                     // cached union of both key sets (efficiency only)
    private String multitoken_re;                       // regular expression over multi-token phrases
    private String re;                                  // regular expression over single-token phrases
    private Locale lang;

    /**
     * Creates a new phraselist from a file.
     * By default: not case-sensitive, en-US locale, canonical forms are not
     * required, regex are not allowed, and unify multi-tokens is false.
     *
     * @param filename path of the phraselist file
     */
    public PhraselistFile(String filename) {
        this(filename, Boolean.FALSE, new Locale("en", "us"), false, false, false);
    }

    /**
     * Creates a new phraselist from a file, indicating case sensitivity and
     * locale. By default: canonical forms are not required and regex are not
     * allowed.
     *
     * @param filename      path of the phraselist file
     * @param casesensitive whether matching is case sensitive
     * @param locale        locale used for lower-casing
     */
    public PhraselistFile(String filename, Boolean casesensitive, Locale locale) {
        this(filename, casesensitive, locale, false, false, false);
    }

    /**
     * Creates a new phraselist from a file with full control over options.
     *
     * @param filename        path of the phraselist file
     * @param casesensitive   whether matching is case sensitive
     * @param locale          locale used for lower-casing
     * @param req_canonical   whether every line must carry a canonical form
     * @param allow_re        whether phrases may contain regex syntax
     * @param uni_multitokens whether multi-word phrases go into the main map
     */
    public PhraselistFile(String filename, Boolean casesensitive, Locale locale,
            Boolean req_canonical, Boolean allow_re, Boolean uni_multitokens) {
        super(filename);
        case_sensitive = casesensitive;
        require_canonical = req_canonical;
        unify_multitokens = uni_multitokens;
        allow_regex = allow_re;
        lang = locale;
        name = "c_" + this.f.getName().substring(0, this.f.getName().lastIndexOf(".")).toLowerCase();
        has_canonical = null;
        re = "_no_regex_to_match_";
        multitoken_re = "_no_regex_to_match_";
        map = new HashMap<>();
        multitoken_map = new HashMap<>();
        // FIX: initialize to empty instead of null so keySet()/intersectPhraselist()
        // do not NPE when the phraselist file contains no entries.
        keyset = new HashSet<>();
        // Good format is mandatory; this loads the maps and the regexes.
        isWellFormatted();
    }

    /**
     * Parses and validates the phraselist file, populating the maps, the key
     * set and the match regexes as a side effect.
     *
     * @return true if the file is well formatted, false otherwise (errors are
     *         reported on stderr; with -DDEBUG=true the stack trace is printed
     *         and the JVM exits)
     */
    @Override
    public Boolean isWellFormatted() {
        try {
            if (super.getFile() == null || url == null) {
                throw new Exception("No file loaded in NLPFile object");
            }
            if (encoding == null
                    || (!encoding.equalsIgnoreCase("UTF-8") && !encoding.equalsIgnoreCase("ASCII"))) {
                throw new Exception("\n\tError: Only ASCII/UTF-8 text is allowed. "
                        + this.f.getName() + " is " + encoding + "\n");
            }
            if (url.getProtocol().equals("file")) {
                this.inputstream = new FileInputStream(f);
            }
            if (url.getProtocol().equals("jar")) {
                JarURLConnection connection = (JarURLConnection) url.openConnection();
                inputstream = connection.getInputStream();
            }
            try (BufferedReader reader = new BufferedReader(new InputStreamReader(inputstream, "UTF-8"))) {
                boolean checked = false; // becomes true after the first non-empty line is classified
                String line;
                int linen = 0;
                while ((line = reader.readLine()) != null) {
                    // Do NOT trim: spaces are significant in phrases.
                    linen++;
                    if (line.length() == 0) {
                        continue;
                    }
                    String token = line;
                    if (!checked) {
                        // The first non-empty line decides whether the whole file
                        // uses canonical forms (anything before the LAST | is the phrase).
                        if (line.matches("^.+\\|[^\\|]*$")) {
                            has_canonical = true;
                            token = line.substring(0, line.lastIndexOf("|"));
                        } else {
                            has_canonical = false;
                            if (require_canonical) {
                                throw new Exception(this.f.getName() + ". Line " + linen + " (" + line
                                        + "): Required canonical form not found.");
                            }
                        }
                        if (!token.contains(" ") || unify_multitokens) {
                            re = "(" + token;
                        } else {
                            multitoken_re = "(" + token;
                        }
                        checked = true;
                    } else {
                        if (has_canonical && !line.contains("|")) {
                            throw new Exception(this.f.getName() + ". Line " + linen + " (" + line
                                    + "): Expected | since other lines had canonical forms");
                        }
                        // FIX: the original tested contains("\\|") — a literal
                        // backslash+pipe — so this consistency check never fired.
                        if (!has_canonical && line.contains("|")) {
                            throw new Exception(this.f.getName() + ". Line " + linen + " (" + line
                                    + "): Canonical (|) not expected since other lines had no canonical forms");
                        }
                        if (has_canonical) {
                            token = line.substring(0, line.lastIndexOf("|"));
                        }
                        if (!token.contains(" ") || unify_multitokens) {
                            re = re.equals("_no_regex_to_match_") ? "(" + token : re + "|" + token;
                        } else {
                            multitoken_re = multitoken_re.equals("_no_regex_to_match_")
                                    ? "(" + token : multitoken_re + "|" + token;
                        }
                    }
                    if (has_canonical) {
                        String value = line.substring(line.lastIndexOf("|") + 1);
                        if (value.length() == 0) {
                            value = token; // "key|" case: canonical form omitted, use the phrase itself
                        }
                        add_to_map(token, value, linen);
                    } else {
                        add_to_map(token, token, linen);
                    }
                }
                if (checked) {
                    if (!re.equals("_no_regex_to_match_")) {
                        re += ")";
                    }
                    if (!multitoken_re.equals("_no_regex_to_match_")) {
                        multitoken_re += ")";
                    }
                    if (!case_sensitive) {
                        re = re.toLowerCase(lang);
                        multitoken_re = multitoken_re.toLowerCase(lang);
                    }
                    if (!allow_regex) {
                        // FIX: escape literal dots as \. — the original replacement
                        // ("\\\\\\\\.") produced \\. at regex level (an escaped
                        // backslash followed by "any character"), which matches the
                        // wrong input. String.replace is literal, so no double
                        // escaping is needed.
                        re = re.replace(".", "\\.");
                        multitoken_re = multitoken_re.replace(".", "\\.");
                    }
                    keyset = new HashSet<>(map.keySet());
                    keyset.addAll(multitoken_map.keySet());
                    // Multi-word partial-match ambiguity can be checked lively
                    // since longest-first ordering is enforced in add_to_map.
                }
            }
        } catch (Exception e) {
            if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) {
                e.printStackTrace(System.err);
                System.exit(1);
            } else {
                System.err.println("Errors found (" + this.getClass().getSimpleName() + "):\n\t"
                        + e.toString() + "\n");
            }
            this.isWellFormatted = false;
            return false;
        }
        this.isWellFormatted = true;
        return true;
    }

    /**
     * Validates a phrase and stores it with its canonical form in the
     * appropriate map (single-token vs multi-token).
     *
     * @param key   the phrase
     * @param value its canonical form (equal to key when the file has none)
     * @param linen line number, for error reporting
     * @throws Exception on forbidden regex syntax, repeated phrases, or a
     *                   shorter phrase appearing before a longer one that
     *                   contains it
     */
    public void add_to_map(String key, String value, int linen) throws Exception {
        if (!case_sensitive) {
            key = key.toLowerCase(lang);
            value = value.toLowerCase(lang);
        }
        if (!allow_regex) {
            if (key.matches(".*[*+?()\\[\\]].*")) {
                // FIX: the message now lists '?' (rejected by the check above)
                // and no longer claims '\' is rejected (it is not checked).
                throw new Exception(this.f.getName()
                        + ".Regex not allowed. Symbols * + ? () [] are not supported.");
            }
        } else {
            // Validates that the regex parses (parentheses/brackets closed, ...).
            Pattern.compile(key);
        }
        if (map.containsKey(key) || multitoken_map.containsKey(key)) {
            throw new Exception(this.f.getName() + ". Line " + linen + " (" + key
                    + "): Repeated phrase. Phraselists must not contain repetitions.");
        }
        // Sub-character containment (e.g. second/seconds): longer phrases must come first.
        for (String oldkey : map.keySet()) {
            if (key.contains(oldkey)) {
                throw new Exception(this.f.getName() + ". Line " + linen + " (" + key
                        + "): Repeated sub-character (" + oldkey + "). Longer phrases must appear first ("
                        + key + ").");
            }
        }
        for (String oldkey : multitoken_map.keySet()) {
            if (key.contains(oldkey)) {
                throw new Exception(this.f.getName() + ". Line " + linen + " (" + key
                        + "): Repeated sub-character (" + oldkey + "). Longer phrases must appear first ("
                        + key + ").");
            }
        }
        // Sub-phrase (n-gram) containment: longer phrases should appear first.
        String[] multitoken = key.trim().split(" "); // trim to avoid matching empty
        if (multitoken.length > 1) {
            for (int i = 0; i < multitoken.length; i++) {
                String token = multitoken[i];
                if (!token.equals("^") && !token.equals("$")) { // anchors are not phrases
                    if (map.containsKey(token) || multitoken_map.containsKey(token)) {
                        throw new Exception(this.f.getName() + ". Line " + linen + " (" + key
                                + "): Repeated sub-phrase (" + token + "). Longer phrases must appear first.");
                    }
                }
                // Extend the n-gram one word at a time and re-check.
                for (int j = 1; j < multitoken.length - i; j++) {
                    token += " " + multitoken[i + j];
                    if (map.containsKey(token) || multitoken_map.containsKey(token)) {
                        throw new Exception(this.f.getName() + ". Line " + linen + " (" + key
                                + "): Repeated sub-phrase (" + token + "). Longer phrases must appear first.");
                    }
                }
            }
            if (unify_multitokens) {
                map.put(key.trim(), value.trim());
            } else {
                multitoken_map.put(key.trim(), value.trim());
            }
        } else {
            map.put(key.trim(), value.trim());
        }
    }

    /**
     * Merges a phraselist map into an accumulated map whose values carry the
     * canonical form plus the originating phraselist class name.
     *
     * @param base   accumulated map (created length-first/alphabetical if null)
     * @param newmap phraselist map to merge in
     * @param c_name phraselist name (stored as "c_" + c_name)
     * @return the merged map
     */
    public static TreeMap<String, String[]> mergeMaps(TreeMap<String, String[]> base,
            HashMap<String, String> newmap, String c_name) {
        if (base == null) {
            base = new TreeMap<>(new LengthAlphabeticalComparator());
        }
        for (Entry<String, String> e : newmap.entrySet()) {
            base.put(e.getKey(), new String[]{e.getValue(), "c_" + c_name});
        }
        return base;
    }

    /**
     * Builds an alternation regex "(k1|k2|...)" from a key set, in iteration
     * order; returns the non-matching sentinel for a null or empty set.
     *
     * @param keyset keys to join
     * @return the alternation regex, or "_no_regex_to_match_"
     */
    public static String get_re_from_keyset(Set<String> keyset) {
        String k_re = "_no_regex_to_match_";
        if (keyset != null && keyset.size() != 0) {
            k_re = "(";
            for (String key : keyset) {
                if (k_re.equals("(")) {
                    k_re += key;
                } else {
                    k_re += "|" + key;
                }
            }
            k_re += ")";
        }
        return k_re;
    }

    @Override
    public String toPlain(String filename) {
        throw new UnsupportedOperationException("toPlain not applicable to this type of file");
    }

    /** @return the single-token phrase -> canonical form map */
    public HashMap<String, String> getMap() {
        return map;
    }

    /** @return the cached union of single-token and multi-token keys */
    public HashSet<String> keySet() {
        return keyset;
    }

    /** @return the canonical form for a single-token phrase, or null */
    public String getMapValue(String key) {
        return map.get(key);
    }

    /** @return the single-token match regex */
    public String getRE() {
        return re;
    }

    /** @return the multi-token phrase -> canonical form map */
    public HashMap<String, String> getMultiMap() {
        return multitoken_map;
    }

    /** @return the canonical form for a multi-token phrase, or null */
    public String getMultiMapValue(String key) {
        return multitoken_map.get(key);
    }

    /** @return the multi-token match regex */
    public String getMultiRE() {
        return multitoken_re;
    }

    /** @return the phraselist name ("c_" + lowercase file basename) */
    public String getName() {
        return name;
    }

    /**
     * Intersects this phraselist's keys with the given set.
     *
     * @param s set of candidate strings
     * @return the keys of this phraselist that are contained in s
     */
    public HashSet<String> intersectPhraselist(HashSet s) {
        HashSet<String> intersection = new HashSet<>(keyset); // copy, then standard set intersection
        intersection.retainAll(s);
        return intersection;
    }
}