package com.cognitionis.nlp_files;
import java.io.*;
import java.net.JarURLConnection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Locale;
import java.util.Map.Entry;
import java.util.Set;
import java.util.TreeMap;
import java.util.regex.Pattern;
/**
* A PhraselistFile consists of entries of the form phrase(one or more
* words)[|[canonical_form]]. Examples: one|1 or address book|contact_list or
* Monday|TWeekday or Lunes|Monday
*
* IMPORTANT: Order matters! LONGER PHRASES MUST APPEAR FIRST. The regex for the
* phrases is built in FIFO order: if your phraselist contains a shorter phrase
* that is contained in a longer phrase which appears afterwards, the latter
* would never be matched.
*
* NOTE: If there is a canonical form, phrases can also contain | like (a|b)|c ->
* only the last | is considered the separator. This saves lines, but to do the
* inverse mapping it would perhaps be better to have one phrase per line...
* There could be functions to condense or expand this (as with grammars).
*
* @author Héctor Llorens
* @since 2011
*/
public class PhraselistFile extends NLPFile {

    /** Sentinel kept in {@code re} / {@code multitoken_re} while no phrase of that kind has been loaded. */
    private static final String NO_REGEX = "_no_regex_to_match_";
    /** A line carrying a canonical form: something, a pipe, then a pipe-free (possibly empty) tail. */
    private static final Pattern CANONICAL_LINE = Pattern.compile("^.+\\|[^\\|]*$");
    /** Characters rejected in phrases when regular expressions are not allowed. */
    private static final Pattern REGEX_SYMBOLS = Pattern.compile("[*+?()\\[\\]]");

    private String name;
    private Boolean has_canonical;
    private Boolean case_sensitive;
    private Boolean require_canonical;
    private Boolean allow_regex;
    private Boolean unify_multitokens;
    private HashMap<String, String> map; // phrase -> canonical form; convert at run-time if another value type is needed
    private HashMap<String, String> multitoken_map; // same as map, for phrases with more than one token (unless unified)
    private HashSet<String> keyset; // added for efficiency ONLY: union of the keys of map and multitoken_map
    private String multitoken_re; // regular expression (alternation) for multi-token phrases
    private String re; // regular expression (alternation) for the remaining phrases
    private Locale lang;

    /**
     * Creates a new phraselist from a file. By default: not case-sensitive,
     * en-US locale, canonical forms are not required, regexes are not allowed,
     * and multi-tokens are not unified.
     *
     * @param filename path of the phraselist file
     */
    public PhraselistFile(String filename) {
        this(filename, Boolean.FALSE, Locale.US, false, false, false);
    }

    /**
     * Creates a new phraselist from a file, indicating if it has to be case
     * sensitive, and the locale. By default: canonical forms are not required,
     * regexes are not allowed, and multi-tokens are not unified.
     *
     * @param filename path of the phraselist file
     * @param casesensitive if false, phrases and canonical forms are lower-cased with {@code locale}
     * @param locale locale used for lower-casing
     */
    public PhraselistFile(String filename, Boolean casesensitive, Locale locale) {
        this(filename, casesensitive, locale, false, false, false);
    }

    /**
     * Creates a new phraselist from a file and immediately parses it (see
     * {@link #isWellFormatted()}).
     *
     * @param filename path of the phraselist file
     * @param casesensitive if false, phrases and canonical forms are lower-cased with {@code locale}
     * @param locale locale used for lower-casing
     * @param req_canonical if true, the first non-empty line must carry a canonical form
     * @param allow_re if true, phrases may be regular expressions; otherwise * + ? ( ) [ ] are rejected
     * @param uni_multitokens if true, multi-token phrases are stored together with single-token ones
     */
    public PhraselistFile(String filename, Boolean casesensitive, Locale locale, Boolean req_canonical, Boolean allow_re, Boolean uni_multitokens) {
        super(filename);
        case_sensitive = casesensitive;
        require_canonical = req_canonical;
        unify_multitokens = uni_multitokens;
        allow_regex = allow_re;
        lang = locale;
        // Name is "c_" + lower-cased file name without its extension. A file name without
        // any '.' is used whole (previously this threw StringIndexOutOfBoundsException).
        String basename = this.f.getName();
        int dot = basename.lastIndexOf(".");
        if (dot >= 0) {
            basename = basename.substring(0, dot);
        }
        name = "c_" + basename.toLowerCase();
        has_canonical = null;
        re = NO_REGEX;
        multitoken_re = NO_REGEX;
        map = new HashMap<>();
        multitoken_map = new HashMap<>();
        keyset = null;
        isWellFormatted(); // good format is mandatory, this loads map<String,String> and re by default
    }

    /**
     * Parses the phraselist file, filling the phrase maps, the key set and the
     * two regular expressions. The first non-empty line decides whether the
     * file uses canonical forms ({@code phrase|canonical}); every later line
     * must agree with it. Previously parsed state is discarded first, so the
     * method can be called again without reporting every phrase as repeated.
     * If parsing fails, both regular expressions stay at the
     * "_no_regex_to_match_" placeholder.
     *
     * @return true if the file was parsed without errors; false otherwise (the
     * error is printed to stderr, or, when the system property DEBUG is "true",
     * the stack trace is printed and the JVM exits with status 1)
     */
    @Override
    public Boolean isWellFormatted() {
        try {
            if (super.getFile() == null || url == null) {
                throw new Exception("No file loaded in NLPFile object");
            }
            if (encoding == null || (!encoding.equalsIgnoreCase("UTF-8") && !encoding.equalsIgnoreCase("ASCII"))) {
                throw new Exception("\n\tError: Only ASCII/UTF-8 text is allowed. " + this.f.getName() + " is " + encoding + "\n");
            }
            resetParsedState();
            if (url.getProtocol().equals("file")) {
                this.inputstream = new FileInputStream(f);
            }
            if (url.getProtocol().equals("jar")) {
                JarURLConnection connection = (JarURLConnection) url.openConnection();
                inputstream = connection.getInputStream();
            }
            StringBuilder single = new StringBuilder();
            StringBuilder multi = new StringBuilder();
            // Closing the reader also closes the underlying input stream.
            try (BufferedReader reader = new BufferedReader(new InputStreamReader(inputstream, "UTF-8"))) {
                boolean checked = false;
                String line;
                int linen = 0;
                while ((line = reader.readLine()) != null) {
                    // Lines are NOT trimmed: spaces are significant.
                    linen++;
                    if (line.length() == 0) {
                        continue;
                    }
                    if (!checked) {
                        // The phrase part may itself contain '|' (options), hence ".+" before the last pipe.
                        has_canonical = CANONICAL_LINE.matcher(line).matches();
                        if (!has_canonical && require_canonical) {
                            throw new Exception(this.f.getName() + ". Line " + linen + " (" + line + "): Required canonical form not found.");
                        }
                        checked = true;
                    } else if (has_canonical && !line.contains("|")) {
                        throw new Exception(this.f.getName() + ". Line " + linen + " (" + line + "): Expected | since other lines had canonical forms");
                    } else if (!has_canonical && line.contains("|")) {
                        // String.contains takes a literal, not a regex: the pipe must not be escaped here.
                        throw new Exception(this.f.getName() + ". Line " + linen + " (" + line + "): Canonical (|) not expected since other lines had no canonical forms");
                    }
                    // Only the last '|' separates the phrase from its canonical form.
                    int sep = has_canonical ? line.lastIndexOf("|") : -1;
                    String token = (sep >= 0) ? line.substring(0, sep) : line;
                    // NOTE(review): the regex is chosen by "contains a space" on the untrimmed token,
                    // while add_to_map chooses the map after trimming; the two can disagree for
                    // phrases with leading/trailing spaces.
                    StringBuilder target = (!token.contains(" ") || unify_multitokens) ? single : multi;
                    target.append(target.length() == 0 ? '(' : '|').append(token);
                    String value = token; // "key|" (value omitted) and no-canonical lines map to themselves
                    if (sep >= 0 && sep + 1 < line.length()) {
                        value = line.substring(sep + 1);
                    }
                    add_to_map(token, value, linen);
                }
                if (checked) {
                    if (single.length() > 0) {
                        re = single.append(')').toString();
                    }
                    if (multi.length() > 0) {
                        multitoken_re = multi.append(')').toString();
                    }
                    if (!case_sensitive) {
                        // NOTE(review): this lower-cases the whole regex text, so with regexes allowed
                        // an escape such as \S would become \s - confirm this is acceptable.
                        re = re.toLowerCase(lang);
                        multitoken_re = multitoken_re.toLowerCase(lang);
                    }
                    if (!allow_regex) {
                        // Replaces every '.' by two backslashes followed by '.'.
                        // NOTE(review): compiled directly as a Pattern that reads as "literal backslash,
                        // then any character"; presumably the consumer un-escapes once more - confirm.
                        // A single-backslash replacement ("\\\\.") would make the dot literal directly.
                        re = re.replaceAll("\\.", "\\\\\\\\."); // escape points
                        multitoken_re = multitoken_re.replaceAll("\\.", "\\\\\\\\."); // escape points
                    }
                    keyset = new HashSet<>(map.keySet());
                    keyset.addAll(multitoken_map.keySet());
                    // Multi-word ambiguity (partial match) is not checked here: it can be resolved
                    // at match time since longest-first ordering is enforced.
                }
            }
        } catch (Exception e) {
            if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) {
                e.printStackTrace(System.err);
                System.exit(1);
            } else {
                System.err.println("Errors found (" + this.getClass().getSimpleName() + "):\n\t" + e.toString() + "\n");
            }
            this.isWellFormatted = false;
            return false;
        }
        this.isWellFormatted = true;
        return true;
    }

    /** Discards everything a previous parse produced so the file can be parsed from scratch. */
    private void resetParsedState() {
        has_canonical = null;
        re = NO_REGEX;
        multitoken_re = NO_REGEX;
        map = new HashMap<>();
        multitoken_map = new HashMap<>();
        keyset = null;
    }

    /**
     * Validates a phrase and stores it (trimmed) with its canonical form.
     * Phrases with more than one space-separated token go to the multi-token
     * map unless multi-tokens are unified; everything else goes to the main map.
     *
     * @param key the phrase; lower-cased first when the list is not case-sensitive
     * @param value the canonical form; lower-cased first when the list is not case-sensitive
     * @param linen line number of the phrase in the file, used in error messages
     * @throws Exception if the phrase uses regex symbols while regexes are not
     * allowed, is not a valid regex while they are allowed, is repeated, or
     * contains an already stored phrase (as substring or as token n-gram),
     * which means a longer phrase appeared after a shorter one
     */
    public void add_to_map(String key, String value, int linen) throws Exception {
        if (!case_sensitive) {
            key = key.toLowerCase(lang);
            value = value.toLowerCase(lang);
        }
        if (!allow_regex) {
            if (REGEX_SYMBOLS.matcher(key).find()) {
                throw new Exception(this.f.getName() + ". Line " + linen + " (" + key + "): Regex not allowed. Symbols * + ? () [] are not supported.");
            }
        } else {
            Pattern.compile(key); // throws if the ( and [ cannot be parsed (unclosed, ...)
        }
        if (map.containsKey(key) || multitoken_map.containsKey(key)) {
            throw new Exception(this.f.getName() + ". Line " + linen + " (" + key + "): Repeated phrase. Phraselists must not contain repetitions.");
        }
        // check sub-character matching (second/seconds)
        checkNoStoredSubstring(key, map.keySet(), linen);
        checkNoStoredSubstring(key, multitoken_map.keySet(), linen);
        // check sub-phrase matching (longer phrases should appear first)
        String[] multitoken = key.trim().split(" "); // trim to avoid matching empty
        if (multitoken.length <= 1) {
            map.put(key.trim(), value.trim());
            return;
        }
        for (int i = 0; i < multitoken.length; i++) {
            // Unigram starting at i; the anchors ^ and $ alone are not treated as phrases.
            String ngram = multitoken[i];
            if (!ngram.equals("^") && !ngram.equals("$")) {
                checkNotStoredSubPhrase(key, ngram, linen);
            }
            // Longer n-grams starting at i.
            for (int j = 1; j < multitoken.length - i; j++) {
                ngram += " " + multitoken[i + j];
                checkNotStoredSubPhrase(key, ngram, linen);
            }
        }
        if (unify_multitokens) {
            map.put(key.trim(), value.trim());
        } else {
            multitoken_map.put(key.trim(), value.trim());
        }
    }

    /** Fails if any already stored phrase in {@code stored} is a substring of {@code key}. */
    private void checkNoStoredSubstring(String key, Set<String> stored, int linen) throws Exception {
        for (String oldkey : stored) {
            if (key.contains(oldkey)) {
                throw new Exception(this.f.getName() + ". Line " + linen + " (" + key + "): Repeated sub-character (" + oldkey + "). Longer phrases must appear first (" + key + ").");
            }
        }
    }

    /** Fails if the token n-gram {@code ngram} of {@code key} is already stored as a phrase. */
    private void checkNotStoredSubPhrase(String key, String ngram, int linen) throws Exception {
        if (map.containsKey(ngram) || multitoken_map.containsKey(ngram)) {
            throw new Exception(this.f.getName() + ". Line " + linen + " (" + key + "): Repeated sub-phrase (" + ngram + "). Longer phrases must appear first.");
        }
    }

    /**
     * Adds every entry of {@code newmap} to {@code base} as
     * {@code key -> {value, "c_" + c_name}}, overwriting existing keys.
     *
     * @param base map to extend; if null a new TreeMap ordered by
     * LengthAlphabeticalComparator is created
     * @param newmap entries to add; null is treated as empty
     * @param c_name class name stored (prefixed with "c_") next to each value
     * @return {@code base}, or the newly created map if {@code base} was null
     */
    public static TreeMap<String, String[]> mergeMaps(TreeMap<String, String[]> base, HashMap<String, String> newmap, String c_name) {
        if (base == null) {
            base = new TreeMap<>(new LengthAlphabeticalComparator());
        }
        if (newmap != null) {
            for (Entry<String, String> e : newmap.entrySet()) {
                base.put(e.getKey(), new String[]{e.getValue(), "c_" + c_name});
            }
        }
        return base;
    }

    /**
     * Builds the alternation {@code (k1|k2|...)} from the keys, in the
     * iteration order of the set.
     *
     * @param keyset the phrases; may be null or empty
     * @return the alternation, or "_no_regex_to_match_" if there are no keys
     */
    public static String get_re_from_keyset(Set<String> keyset) {
        if (keyset == null || keyset.isEmpty()) {
            return NO_REGEX;
        }
        StringBuilder alternation = new StringBuilder("(");
        boolean first = true; // explicit flag: an empty first key must still be followed by '|'
        for (String key : keyset) {
            if (!first) {
                alternation.append('|');
            }
            alternation.append(key);
            first = false;
        }
        return alternation.append(')').toString();
    }

    /**
     * Not supported for phraselists.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public String toPlain(String filename) {
        throw new UnsupportedOperationException("toPlain not applicable to this type of file");
    }

    /** @return the live phrase -> canonical form map (not a copy) */
    public HashMap<String, String> getMap() {
        return map;
    }

    /** @return all phrases of both maps, or null if no phrase has been loaded */
    public HashSet<String> keySet() {
        return keyset;
    }

    /** @return the canonical form stored for {@code key} in the main map, or null if absent */
    public String getMapValue(String key) {
        return map.get(key);
    }

    /** @return the regex for the main map phrases, or "_no_regex_to_match_" if there are none */
    public String getRE() {
        return re;
    }

    /** @return the live multi-token phrase -> canonical form map (not a copy) */
    public HashMap<String, String> getMultiMap() {
        return multitoken_map;
    }

    /** @return the canonical form stored for {@code key} in the multi-token map, or null if absent */
    public String getMultiMapValue(String key) {
        return multitoken_map.get(key);
    }

    /** @return the regex for the multi-token phrases, or "_no_regex_to_match_" if there are none */
    public String getMultiRE() {
        return multitoken_re;
    }

    /** @return "c_" followed by the lower-cased file name without extension */
    public String getName() {
        return name;
    }

    /**
     * Returns the phrases of this list that are also contained in {@code s}.
     *
     * @param s the set to intersect with
     * @return a new set with the common elements; empty if {@code s} is null or
     * no phrase has been loaded
     */
    public HashSet<String> intersectPhraselist(HashSet<?> s) {
        HashSet<String> intersection = new HashSet<>();
        if (keyset == null || s == null) {
            return intersection;
        }
        intersection.addAll(keyset);
        intersection.retainAll(s); // java standard for set intersection
        return intersection;
    }
}