package com.cognitionis.nlp_lang_models;
import java.io.*;
import java.util.*;
import java.util.Map.*;
import java.util.regex.*;
/**
*
* @author Héctor Llorens
* @since 2011
*
* Language Model: FingerPrint
*
* A language fingerprint is a ranked list of the x most common n-grams (where n normally ranges from 1 up to between 3 and 5).
* It is a very simplistic language model, but it gives reasonably good results in the language detection task.
* The tool is called TextCategorizer because it can categorize not only languages but also topics, classes, ...
*
*/
public class TextCategorizerFingerprint {

    /**
     * Orders n-gram keys by descending score. Ties are broken in favour of the
     * longer n-gram (a longer n-gram carries more information) and finally by
     * natural String order, so the resulting ranking is a deterministic total order.
     */
    private static final class NGramMapComparator implements Comparator<String> {

        /** Scores of the n-grams being compared (n-gram -> score). */
        private final Map<String, Integer> data;

        NGramMapComparator(Map<String, Integer> data) {
            this.data = data;
        }

        @Override
        public int compare(String k1, String k2) {
            // Integer.compare instead of subtraction: cannot overflow.
            int diff = Integer.compare(this.data.get(k2), this.data.get(k1));
            if (diff == 0) {
                diff = Integer.compare(k2.length(), k1.length());
                if (diff == 0) {
                    diff = k1.compareTo(k2);
                }
            }
            return diff;
        }
    }

    /** Number of entries of this fingerprint that are compared against a category in {@link #getDistance}. */
    private static final int MAX_COMPARED_NGRAMS = 400;

    /**
     * N-grams accepted into a fingerprint: an optional leading "_", any number of
     * characters other than digits, '?', '!', '-', '_' and '/', and an optional trailing "_".
     */
    private static final Pattern NGRAM_PATTERN = Pattern.compile("^_?[^0-9\\?!\\-_/]*_?$");

    /** Category of this fingerprint; "unknown" until it is loaded from a file or categorized. */
    private String category = "unknown";

    /**
     * The fingerprint itself: n-gram -> value. {@link #create(String)} stores the
     * rank (1 = best) of each n-gram; the file constructor stores the value read
     * from the file. The map is insertion-ordered so that iteration follows the
     * rank order produced by {@link #create(String)} (or the line order of a loaded file).
     */
    private final HashMap<String, Integer> sorted_entries = new LinkedHashMap<>();

    /** Distances computed by {@link #categorize}: category name -> distance. */
    private final HashMap<String, Integer> categoryDistances = new HashMap<>();

    /**
     * Creates an empty fingerprint.
     */
    public TextCategorizerFingerprint() {
    }

    /**
     * Creates a fingerprint by reading the fingerprint (language model) file
     * referenced by the passed path. Each non-blank line must hold an n-gram and
     * an integer value separated by whitespace; blank lines are skipped.
     *
     * Errors are not propagated: they are reported on System.err (with a stack
     * trace when the system property DEBUG is "true"). The category is assigned
     * as soon as the file could be opened, even if a later line is malformed; in
     * that case the entries read before the bad line are kept.
     *
     * @param file_path
     *            path to the fingerprint file
     * @param cat
     *            category of the fingerprint file
     */
    public TextCategorizerFingerprint(String file_path, String cat) {
        // NOTE: the file is read with the platform default charset.
        try (BufferedReader reader = new BufferedReader(new FileReader(new File(file_path)))) {
            this.category = cat;
            this.sorted_entries.clear();
            String line;
            while ((line = reader.readLine()) != null) {
                String trimmed = line.trim();
                if (trimmed.isEmpty()) {
                    // A blank line carries no entry; it must not abort the load.
                    continue;
                }
                String[] fields = trimmed.split("\\s+");
                if (fields.length != 2) {
                    throw new IOException("Malformed fingerprint file '" + file_path
                            + "'.\n\tEach line must contain an n-gram and its rank separated by whitespace: '"
                            + line + "'");
                }
                this.sorted_entries.put(fields[0], Integer.valueOf(fields[1]));
            }
        } catch (IOException | RuntimeException e) {
            System.err.println("Errors found (" + this.getClass().getSimpleName() + "):\n\t" + e.toString() + "\n");
            if ("true".equalsIgnoreCase(System.getProperty("DEBUG"))) {
                e.printStackTrace(System.err);
            }
        }
    }

    /**
     * Saves the fingerprint to a file named &lt;categoryname&gt;.lm in the
     * execution path, using the format produced by {@link #toString()}.
     * Nothing is written when a file with that name already exists.
     * I/O errors are reported on System.err and not propagated.
     */
    public void save() {
        File file = new File(this.getCategory() + ".lm");
        try {
            if (file.createNewFile()) {
                // NOTE: the text is encoded with the platform default charset.
                try (FileOutputStream fos = new FileOutputStream(file)) {
                    fos.write(this.toString().getBytes());
                }
            }
        } catch (IOException ioe) {
            ioe.printStackTrace(System.err);
        }
    }

    /**
     * Returns the category of the fingerprint or "unknown" if the fingerprint
     * wasn't categorized yet.
     *
     * @return the category of the fingerprint
     */
    public String getCategory() {
        return this.category;
    }

    /**
     * Returns the n-gram -> value map backing this fingerprint. This is the
     * live internal map, not a copy.
     *
     * @return the entries of the fingerprint
     */
    public HashMap<String, Integer> getSorted_entries() {
        return this.sorted_entries;
    }

    /**
     * Returns the fingerprint as a String in the fingerprint file format: one
     * line per entry, holding the n-gram, a tab and its value.
     */
    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder();
        for (Map.Entry<String, Integer> entry : this.sorted_entries.entrySet()) {
            sb.append(entry.getKey()).append('\t').append(entry.getValue()).append('\n');
        }
        return sb.toString();
    }

    /**
     * Creates the fingerprint by analysing the content of the given file.
     * I/O errors are reported on System.err and leave the fingerprint untouched.
     *
     * @param file file to be analysed
     */
    public void create(File file) {
        // TODO: handle the file encoding (the platform default charset is used).
        // TODO: process the file line by line instead of loading it completely into memory.
        StringBuilder content = new StringBuilder();
        char[] buffer = new char[1024];
        try (FileReader reader = new FileReader(file)) {
            int read;
            while ((read = reader.read(buffer)) != -1) {
                content.append(buffer, 0, read);
            }
        } catch (IOException e) {
            e.printStackTrace(System.err);
            return;
        }
        this.create(content.toString());
    }

    /**
     * Creates the fingerprint of the given text, replacing any previous content.
     * All n-grams of order 1 to 5 are counted, sorted with
     * {@link NGramMapComparator} and stored with their rank (1 = first).
     *
     * @param text text to be analysed
     */
    public void create(String text) {
        HashMap<String, Integer> entries = this.computeNGrams(1, 5, text);
        // The score of the blank is halved; presumably because every token
        // contributes both a leading and a trailing "_".
        Integer blanksScore = entries.get("_");
        if (blanksScore != null) {
            entries.put("_", blanksScore / 2);
        }
        List<String> sortedKeys = new ArrayList<>(entries.keySet());
        Collections.sort(sortedKeys, new NGramMapComparator(entries));
        this.sorted_entries.clear();
        // Inserted by ascending rank, so the map iterates best n-gram first.
        int rank = 0;
        for (String ngram : sortedKeys) {
            this.sorted_entries.put(ngram, ++rank);
        }
    }

    /**
     * Counts all n-grams of the requested orders occurring in the tokens of the
     * given text. Periods and commas are removed first, the text is split on
     * whitespace and every token is surrounded by "_". For example:
     *
     * text = "text" ngramMinOrder = 2, ngramMaxOrder = 2
     *
     * yields the n-grams
     *
     * "_t", "te", "ex", "xt", "t_"
     *
     * all with a score (occurrence count) of 1. N-grams that do not match
     * {@link #NGRAM_PATTERN} are ignored.
     *
     * @param ngramMinOrder smallest n-gram length to count
     * @param ngramMaxOrder largest n-gram length to count
     * @param text text to be analysed
     * @return n-gram -> number of occurrences
     */
    private HashMap<String, Integer> computeNGrams(int ngramMinOrder, int ngramMaxOrder, String text) {
        // MOD TEXTCAT: period and comma are removed because they are very common
        // and do not give information in European languages.
        String cleaned = text.replaceAll("[.,]", "");
        // Words plus other language symbols (numbers, -, ...)
        String[] tokens = cleaned.split("\\s+");
        HashMap<String, Integer> entries = new HashMap<>();
        for (String token : tokens) {
            if (token.isEmpty()) {
                // Empty text or leading whitespace yields an empty token; it is not a
                // word and would only add a bogus "__" n-gram and extra "_" counts.
                continue;
            }
            // Consider a space (_) before and after the token.
            String padded = "_" + token + "_";
            for (int order = ngramMinOrder; order <= ngramMaxOrder; ++order) {
                for (int i = 0; i + order <= padded.length(); i++) {
                    String ngram = padded.substring(i, i + order);
                    if (!NGRAM_PATTERN.matcher(ngram).find()) {
                        continue;
                    }
                    Integer score = entries.get(ngram);
                    entries.put(ngram, score == null ? 1 : score + 1);
                }
            }
        }
        return entries;
    }

    /**
     * Categorizes the fingerprint by computing the distance to the fingerprints
     * in the passed collection. The category of the fingerprint with the lowest
     * distance is assigned to this fingerprint (the first one wins on ties); the
     * category is left unchanged when the collection is empty.
     *
     * The returned map is the one kept by this object: distances stored by
     * earlier calls are not removed, only overwritten per category name.
     *
     * @param categories fingerprints to compare against
     * @return the distances (category name -> distance)
     */
    public Map<String, Integer> categorize(Collection<TextCategorizerFingerprint> categories) {
        int minDistance = Integer.MAX_VALUE;
        for (TextCategorizerFingerprint fp : categories) {
            int distance = this.getDistance(fp.getSorted_entries());
            this.categoryDistances.put(fp.getCategory(), distance);
            if (distance < minDistance) {
                minDistance = distance;
                this.category = fp.getCategory();
            }
        }
        return this.getCategoryDistances();
    }

    /**
     * Returns the distances computed so far by {@link #categorize}.
     *
     * @return the live map category name -> distance
     */
    public Map<String, Integer> getCategoryDistances() {
        return this.categoryDistances;
    }

    /**
     * Computes the distance of this fingerprint to the passed fingerprint map.
     * Only the first {@link #MAX_COMPARED_NGRAMS} entries of this fingerprint are
     * used; because the entries are kept in insertion order these are the
     * best-ranked n-grams of a fingerprint built by {@link #create(String)}.
     * For an n-gram present in both maps the absolute difference of the two
     * values is added; for an n-gram missing from the category its size is added.
     *
     * NOTE(review): an empty category map adds a penalty of 0 for every n-gram and
     * therefore yields distance 0; callers may want to exclude fingerprints that
     * failed to load.
     *
     * @param category
     *            the fingerprint map to be compared to this one
     * @return the distance of the passed fingerprint to this fingerprint
     */
    private int getDistance(HashMap<String, Integer> category) {
        int distance = 0;
        int count = 0;
        for (Map.Entry<String, Integer> entry : this.sorted_entries.entrySet()) {
            if (++count > MAX_COMPARED_NGRAMS) {
                break;
            }
            Integer otherRank = category.get(entry.getKey());
            if (otherRank == null) {
                distance += category.size();
            } else {
                distance += Math.abs(entry.getValue() - otherRank);
            }
        }
        return distance;
    }
}