package com.cognitionis.nlp_files;
import java.io.*;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Locale;
/**
* A phraselist file consists of one entry per line, of the form
* phrase(one or more words)[|[canonical_form]]. Examples: one|1 or
* address book|contact_list or Monday|TWeekday or Lunes|Monday
*
* IMPORTANT: Order matters! LONGER PHRASES MUST APPEAR FIRST. The regex for the
* phrases is built in file order (FIFO): if your phraselist contains a shorter
* phrase that is contained in a longer phrase which appears afterwards, the
* latter would never be matched.
*
* NOTE: If there is a canonical form, phrases can also contain |, like (a|b)|c ->
* only the last | is considered the separator. This saves lines, but to do the
* inverse mapping it would perhaps be better to have one phrase per line...
* There could be functions to condense or expand this (as with grammars).
*
* @author Héctor Llorens
* @since 2011
*/
public class RegexPhraselistFile extends NLPFile {

    /** Placeholder kept in {@code re} while no phrase has been loaded. */
    private static final String NO_REGEX = "_no_regex_to_match_";

    /** Identifier derived from the file name: "c_" + lower-cased name without extension. */
    private String name;
    /** TRUE if entries carry a canonical form (phrase|canonical); null until a file was parsed. */
    private Boolean has_canonical;
    /** When FALSE, map keys and the regex are lower-cased using {@code lang}. */
    private Boolean case_sensitive;
    /** phrase -> canonical form (or phrase -> phrase). Convert at the call site if another value type is needed. */
    private HashMap<String, String> map;
    /** Snapshot of map.keySet(), kept for efficiency only; null while nothing has been loaded. */
    private HashSet<String> keyset;
    /** Alternation regex "(p1|p2|...)" built from the phrases in file order. */
    private String re;
    /** Locale used for lower-casing when the list is case-insensitive. */
    private Locale lang;

    /**
     * Loads a case-insensitive phraselist using the en/us locale.
     *
     * @param filename path of the phraselist file
     */
    public RegexPhraselistFile(String filename) {
        this(filename, Boolean.FALSE, new Locale("en", "us"));
    }

    /**
     * Loads a phraselist file and immediately parses it.
     *
     * @param filename      path of the phraselist file
     * @param casesensitive FALSE to lower-case phrases and regex with {@code locale}
     * @param locale        locale used for lower-casing
     */
    public RegexPhraselistFile(String filename, Boolean casesensitive, Locale locale) {
        super(filename);
        case_sensitive = casesensitive;
        lang = locale;
        name = "c_" + stripExtension(this.f.getName()).toLowerCase();
        has_canonical = null;
        re = NO_REGEX;
        map = new HashMap<>();
        keyset = null;
        isWellFormatted(); // good format is mandatory, this loads map<String,String> and re by default
    }

    /**
     * Returns the file name without its last extension; a name without any
     * dot is returned unchanged instead of failing in substring.
     */
    private static String stripExtension(String fileName) {
        int dot = fileName.lastIndexOf('.');
        return dot < 0 ? fileName : fileName.substring(0, dot);
    }

    /**
     * Parses the file, validating its format and (re)building the phrase map,
     * the key set and the regex.
     *
     * The first non-empty line decides whether the file has canonical forms;
     * every later non-empty line must follow the same format. Results are
     * accumulated locally and only committed when the whole file is valid, so
     * a failed parse does not leave a half-built regex behind and calling this
     * method again does not report every phrase as repeated.
     *
     * Errors are reported on System.err; with the system property DEBUG=true
     * the stack trace is printed and the JVM exits with status 1.
     *
     * @return true if the file was parsed without errors, false otherwise
     */
    @Override
    public Boolean isWellFormatted() {
        try {
            if (super.getFile() == null) {
                throw new Exception("No file loaded in NLPFile object");
            }
            if (!encoding.equalsIgnoreCase("UTF-8") && !encoding.equalsIgnoreCase("ASCII")) {
                throw new Exception("\n\tError: Only ASCII/UTF-8 text is allowed. " + this.f.getName() + " is " + encoding + "\n");
            }

            HashMap<String, String> parsed = new HashMap<>();
            StringBuilder regex = new StringBuilder();
            Boolean canonical = null; // null until the first non-empty line has been seen

            try (BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(this.f), "UTF-8"))) {
                String line;
                int linen = 0;
                while ((line = reader.readLine()) != null) {
                    // lines are deliberately not trimmed: spaces are significant
                    linen++;
                    if (line.length() == 0) {
                        continue;
                    }
                    if (canonical == null) {
                        // phrases may contain '|' themselves, only the last one separates the canonical form
                        canonical = line.matches("^.+\\|[^\\|]*$");
                        regex.append('(');
                    } else {
                        regex.append('|');
                    }
                    if (canonical) {
                        int bar = line.lastIndexOf("|");
                        if (bar < 0) {
                            throw new Exception(where(linen, line) + ": Expected | since other lines had canonical forms");
                        }
                        regex.append(line, 0, bar);
                        addCanonicalEntry(parsed, line, bar, linen);
                    } else {
                        // String.contains is literal: test for "|" itself, not for an escaped regex pipe
                        if (line.contains("|")) {
                            throw new Exception(where(linen, line) + ": Canonical (|) not expected since other lines had no canonical forms");
                        }
                        regex.append(line);
                        addPlainEntry(parsed, line, linen);
                    }
                }
            }

            String builtRegex = NO_REGEX;
            HashSet<String> builtKeys = null;
            if (canonical != null) {
                builtRegex = regex.append(')').toString();
                if (!case_sensitive) {
                    // NOTE(review): this also lower-cases regex escapes such as \S or \W if a phrase uses them
                    builtRegex = builtRegex.toLowerCase(lang);
                }
                builtKeys = new HashSet<>(parsed.keySet());
            }

            // commit only after the whole file has been validated
            has_canonical = canonical;
            map = parsed;
            re = builtRegex;
            keyset = builtKeys;
        } catch (Exception e) {
            if ("true".equalsIgnoreCase(System.getProperty("DEBUG"))) {
                e.printStackTrace(System.err);
                System.exit(1);
            } else {
                System.err.println("Errors found (" + this.getClass().getSimpleName() + "):\n\t" + e.toString() + "\n");
            }
            this.isWellFormatted = false;
            return false;
        }
        this.isWellFormatted = true;
        return true;
    }

    /** Builds the "file. Line n (line)" prefix shared by all format error messages. */
    private String where(int linen, String line) {
        return this.f.getName() + ". Line " + linen + " (" + line + ")";
    }

    /**
     * Validates and stores one "phrase|canonical" entry.
     *
     * @param target map being filled for the current parse
     * @param line   raw line
     * @param bar    index of the last '|' in {@code line}
     * @param linen  1-based line number, for error messages
     * @throws Exception if the phrase repeats, contains, or is composed of an earlier phrase
     */
    private void addCanonicalEntry(HashMap<String, String> target, String line, int bar, int linen) throws Exception {
        String key = line.substring(0, bar);
        if (!case_sensitive) {
            key = key.toLowerCase(lang);
        }
        String value = line.substring(bar + 1);

        if (target.containsKey(key)) {
            throw new Exception(where(linen, line) + ": Repeated phrase. Phraselists must not contain repetitions.");
        }
        // check sub-character matching (second/seconds)
        for (String oldkey : target.keySet()) {
            if (key.contains(oldkey)) {
                throw new Exception(where(linen, line) + ": Repeated sub-character (" + oldkey + "). Longer phrases must appear first (" + key + ").");
            }
        }
        checkSubPhrases(target, key, line, linen);

        String storedKey = key.trim();
        // an empty canonical form means the phrase maps to itself
        target.put(storedKey, value.length() != 0 ? value.trim() : storedKey);
    }

    /**
     * Rejects a multi-word key when any of its word n-grams is already a key
     * (longer phrases must appear first). Single "^" and "$" tokens are not
     * tested on their own.
     *
     * Every n-gram tested here is also a substring of the key, so the
     * substring check done by the caller fires first; this pass is kept as a
     * safety net should that check ever be relaxed.
     */
    private void checkSubPhrases(HashMap<String, String> target, String key, String line, int linen) throws Exception {
        String[] words = key.trim().split(" "); // trim to avoid matching empty
        if (words.length <= 1) {
            return;
        }
        for (int i = 0; i < words.length; i++) {
            String ngram = words[i];
            if (!ngram.equals("^") && !ngram.equals("$") && target.containsKey(ngram)) {
                throw new Exception(where(linen, line) + ": Repeated sub-phrase (" + ngram + "). Longer phrases must appear first.");
            }
            for (int j = 1; j < words.length - i; j++) {
                ngram += " " + words[i + j];
                if (target.containsKey(ngram)) {
                    throw new Exception(where(linen, line) + ": Repeated sub-phrase (" + ngram + "). Longer phrases must appear first.");
                }
            }
        }
    }

    /**
     * Validates and stores one entry of a list without canonical forms; the
     * phrase maps to itself. For case-insensitive lists the key is lower-cased
     * so that it agrees with the lower-cased regex, as canonical entries do.
     */
    private void addPlainEntry(HashMap<String, String> target, String line, int linen) throws Exception {
        String key = case_sensitive ? line : line.toLowerCase(lang);
        if (target.containsKey(key)) {
            throw new Exception(where(linen, line) + ": Repeated phrase!! Phraselists must not contain repetitions.");
        }
        target.put(key, key);
    }

    /**
     * Not supported for phraselists.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public String toPlain(String filename) {
        throw new UnsupportedOperationException("toPlain not applicable to this type of file");
    }

    /** @return the internal phrase map (not a copy) */
    public HashMap<String, String> getMap() {
        return map;
    }

    /** @return the cached key set, or null if no phrase has been loaded */
    public HashSet<String> keySet() {
        return keyset;
    }

    /** @return the value stored for {@code key}, or null if absent */
    public String getMapValue(String key) {
        return map.get(key);
    }

    /** @return the alternation regex, or the "_no_regex_to_match_" placeholder if nothing was loaded */
    public String getRE() {
        return re;
    }

    /** @return the identifier derived from the file name */
    public String getName() {
        return name;
    }

    /**
     * Returns the phrases of this list that are also contained in {@code s}.
     *
     * @param s set to intersect with; may be null
     * @return a new set with the common elements; empty if {@code s} is null
     *         or no phrase has been loaded
     */
    public HashSet<String> intersectPhraselist(HashSet<?> s) {
        HashSet<String> intersection = new HashSet<>();
        if (keyset == null || s == null) {
            return intersection;
        }
        intersection.addAll(keyset);
        intersection.retainAll(s); // java standard for set intersection
        return intersection;
    }
}