package com.cognitionis.nlp_segmentation;
import com.cognitionis.utils_basickit.FileUtils;
import java.io.*;
import java.util.regex.Pattern;
/**
* Tokenizer_PTB_Rulebased
* STRATEGY (TreeBank format):
* 1) Separate what is always a token
* 2) Separate what is always a token when followed by space
* 3) Separate periods except accronyms and abbreviations
*
* GOOD EXTRAS:
* 1) Complement this with an alignment tool to get back the original text (either with space, offset handling or both)
*
* KNOWN LIMITATIONS:
* 1) Period ambiguity is not handled correctly (however this is a known problem often handled at POS tagging step)
*
* @author Hector Llorens
* @since May 20, 2013
*/
public class Tokenizer_PTB_Rulebased {
// do simple regex sentence splitting by default: replaceAll("^\\.$","\\.\n")
private Boolean doSentSplit;
// default abbrevs
public static final String defauldAbbrevRegex = "(adj|Adm|adv|Ala|apdo|Apdo|Ariz|Ark|Aug|Ave|Bancorp|Bhd|Brig|Bros|Ca|Calif|Capt|Cia|Cía|Cie|Co|CO|Col|Colo|Conn|Corp|CORP|Cos|COS|Dec|Del|dept|Dept|D-Mass|Dr|dr|Drs|ej|etc|Etc|fdo|Feb|Fla|ft|Ft|Ga|Gen|Gob|Gov|Hon|Ill|Inc|INC|Ind|Jan|Jr|Kan|Ky|La|Lt|Ltd|Maj|Mass|Md|Messrs|Mfg|Mich|Minn|Miss|Mo|Mr|Mrs|Ms|Neb|Nev|No|Nos|Nov|num|Num|núm|Oct|Okla|Ont|Ore|Pa|pág|págs|Ph|Prof|Prop|Pty|Rep|Reps|Rev|Sen|Sens|Sept|Sgt|Sr|Sra|Srta|St|Ste|tel|Tel|telef|Telef|Tenn|Tex|Ud|Uds|Va|Va|Vd|Vds|vol|vs|Vs|VS|Vt|Wash|Wis|Wyo)";
// space regexes
public static Pattern basicSpacePattern = Pattern.compile("^|$|\\s+", Pattern.MULTILINE);
// Always separated as tokens (grouped if equal)
public static Pattern alwaysTokenPattern = Pattern.compile("(\\[+|\\(+|\\{+|\\<+|\\]+|\\)+|\\}+|\\>+|[;]+|[?!]+|[¿¡]+|=+|\\.\\.+|--+|\"|`+|''+)");
// Only token if followed by space (?= is needed not to replace it afterwards)
public static Pattern bySpacePattern = Pattern.compile("([-:,]\\s|\\s-(?![\\d,.]))");
// single quote regex \s' and '\s (except '\d0(s)? or '' (which are already separated)
public static Pattern singleQuotePattern = Pattern.compile("('\\d+0s?|(?<=\\s)'(?!')|(?<!')'(?=\\s))");
// abbreviation regexes. NOTE: language dependent, could be file.list per lang, however to simplify we use a GENERIC one: English and Catalan are included (?![t]\\b) this is to avoid separating n't which is OK since never happens in Catalan
public static Pattern oneWordPrefixPattern = Pattern.compile("\\b([ldnmts]')(?=[a-zA-ZÀ-ÿ0-9])(?![t]\\b)", Pattern.CASE_INSENSITIVE);
public static Pattern oneWordSuffixPattern = Pattern.compile("('ll|'re|'ve|n't|'[smdnl]|-(?:la|li|lo|ho|hi|me|te|se))\\b", Pattern.CASE_INSENSITIVE);
// contractions
public static final String[] twoWordContractions = new String[]{"\\b(can)(not)\\b", "\\b(d')(ye)\\b", "\\b(gim)(me)\\b", "\\b(gon)(na)\\b", "\\b(got)(ta)\\b", "\\b(lem)(me)\\b", "\\b(more)('n)\\b", "\\b(wan)(na)\\b"};
public static final Pattern[] twoWordContractionPatterns = new Pattern[twoWordContractions.length];
static {
for (int i = 0; i < twoWordContractions.length; i++) {
twoWordContractionPatterns[i] = Pattern.compile(twoWordContractions[i], Pattern.CASE_INSENSITIVE);
}
}
public static final String[] threeWordContractionRegexes = new String[]{"\\b(wha)(dd)(ya)\\b", "\\b(wha)(t)(cha)\\b"};
public static final Pattern[] threeWordContractionsPatterns = new Pattern[threeWordContractionRegexes.length];
static {
for (int i = 0; i < threeWordContractionRegexes.length; i++) {
threeWordContractionsPatterns[i] = Pattern.compile(threeWordContractionRegexes[i], Pattern.CASE_INSENSITIVE);
}
}
//public static Pattern tAbbreviationPattern = Pattern.compile("('t)(is|was)\\b"); // to slang to be supported
protected Pattern[] patterns;
private String abbrevRegex;
public Pattern periodPattern;
public Tokenizer_PTB_Rulebased() {
this(true);
}
public Tokenizer_PTB_Rulebased(Boolean sentsplit) {
doSentSplit = sentsplit;
String periodRegex = "((?<=(\\d|[a-zA-ZÀ-ÿ]['][a-zA-ZÀ-ÿ][a-zA-ZÀ-ÿ]?))\\.(?=[\\s])|(?<!(\\b[a-zA-ZÀ-ÿ]|" + defauldAbbrevRegex + "|\\.\\.))\\.(?=[\\s]))";
periodPattern = Pattern.compile(periodRegex, Pattern.MULTILINE);
patterns = new Pattern[]{bySpacePattern, singleQuotePattern, oneWordSuffixPattern, alwaysTokenPattern, oneWordPrefixPattern};
}
public Tokenizer_PTB_Rulebased(Boolean sentsplit, File abbrev) {
doSentSplit = sentsplit;
abbrevRegex = "";
try {
BufferedReader reader = new BufferedReader(new FileReader(abbrev));
try {
String line; // = null; unecessary
int i = 0;
while ((line = reader.readLine()) != null) {
//abbrevMap.put(line, 1);
line = line.trim();
if (line.equals("") || line.startsWith("#") || line.startsWith("//")) {
continue;
}
/*if (line.endsWith(".")) {line=line.substring(0, -1); }*/
if (i == 0) {
abbrevRegex = "(" + line;
} else {
abbrevRegex += "|" + line;
}
i++;
}
if (!abbrevRegex.equals("")) {
abbrevRegex = abbrevRegex.replaceAll("\\.", "") + ")";
}
} finally {
reader.close(); //if (reader != null) { unnecessary
}
} catch (Exception e) {
System.err.println("Errors found (FileUtils):\n\t" + e.toString() + ":" + e.getMessage() + "\n");
if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) {
e.printStackTrace(System.err);
}
System.exit(1);
}
String periodRegex = "((?<=(\\d|[a-zA-ZÀ-ÿ]['][a-zA-ZÀ-ÿ][a-zA-ZÀ-ÿ]?))\\.(?=[\\s])|(?<!(\\b[a-zA-ZÀ-ÿ]|" + abbrevRegex + "|\\.\\.))\\.(?=[\\s]))";
periodPattern = Pattern.compile(periodRegex, Pattern.MULTILINE);
patterns = new Pattern[]{bySpacePattern, alwaysTokenPattern, singleQuotePattern, oneWordSuffixPattern, oneWordPrefixPattern};
}
/**
* Tokenizes the input text and returns a string corresponding to the tokens
* as one token per line
*
* @param text (String plain text)
* @return tokens (String)
*/
public String tokenize(String text) {
String tokens = "";
if (text != null && text.length() != 0) {
text = basicSpacePattern.matcher(text).replaceAll(" ");
for (Pattern pattern : patterns) {
text = pattern.matcher(text).replaceAll(" $1 ");
}
for (Pattern pattern : twoWordContractionPatterns) {
text = pattern.matcher(text).replaceAll(" $1 $2");
}
for (Pattern pattern : threeWordContractionsPatterns) {
text = pattern.matcher(text).replaceAll(" $1 $2 $3");
}
text = periodPattern.matcher(text).replaceAll(" . ");
text = basicSpacePattern.matcher(text).replaceAll(" "); // AGAIN to make sure there are no extra spaces
//Pattern RemoveStartEndSpacePattern = Pattern.compile("^(\\s+)|(\\s+)$", Pattern.MULTILINE);
//text = RemoveStartEndSpacePattern.matcher(text).replaceAll(""); // clean first and end spaces (trim?)
text = text.trim();
if (!text.isEmpty() && !text.equals("")) {
//tokens = text.toString().split("\\s+");
tokens = text.toString().replaceAll("\\s+", "\n");
if (doSentSplit) {
tokens += "\n";
tokens = tokens.replaceAll("\n([.!?])\n", "\n$1\n\n");
}
}
}
return tokens;
}
/**
* Tokenize an input file and output tokens in another file or stdout if
* null.
*
* @param in_file
* @param out_file
*/
public void tokenize(File in_file, File out_file) throws Exception {
String encoding = FileUtils.getEncoding(in_file);
if (!encoding.equalsIgnoreCase("UTF-8") && !encoding.equalsIgnoreCase("ASCII")) {
throw new Exception("\n\tError: Only ASCII/UTF-8 text is allowed. " + in_file.getName() + " is " + encoding + "\n");
}
//try {
BufferedWriter out;
if (out_file != null) {
out = new BufferedWriter(new FileWriter(out_file));
} else {
out = new BufferedWriter(new OutputStreamWriter(System.out));
}
// read-line-by-line and write line by line
BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(in_file), "UTF-8"));
try {
String line;
String previous_line = null; // avoid extra new line in the last line ended by sent-separator
while ((line = in.readLine()) != null) {
if (previous_line != null) {
out.write(tokenize(previous_line));
}
previous_line = line;
}
String last_line = tokenize(previous_line);
if (doSentSplit && last_line.endsWith("\n\n")) {
last_line = last_line.substring(0, last_line.length() - 1);
}
out.write(last_line);
} finally {
if (out_file == null) {
out.flush();
} else {
out.close();
}
in.close();
}
/*} catch (IOException ex) {
Logger.getLogger(Tokenizer_PTB_Rulebased.class.getName()).log(Level.SEVERE, null, ex);
}*/
}
/**
* Tokenizes the input text and returns a string array corresponding to the tokens.
* Normally the array is printed as one token per line
*
* @param text (plain text)
* @return tokens (String [] array)
*/
public String[] getTokenTexts(String text) throws FileNotFoundException {
text = basicSpacePattern.matcher(text).replaceAll(" ");
for (Pattern pattern : patterns) { text = pattern.matcher(text).replaceAll(" $1 "); }
for (Pattern pattern : twoWordContractionPatterns) { text = pattern.matcher(text).replaceAll(" $1 $2"); }
for (Pattern pattern : threeWordContractionsPatterns) { text = pattern.matcher(text).replaceAll(" $1 $2 $3"); }
text = periodPattern.matcher(text).replaceAll(" . ");
text = basicSpacePattern.matcher(text).replaceAll(" "); // AGAIN to make sure there are no extra spaces
Pattern RemoveStartEndSpacePattern = Pattern.compile("^(\\s+)|(\\s+)$", Pattern.MULTILINE);
text = RemoveStartEndSpacePattern.matcher(text).replaceAll(""); // clean first and end spaces
String[] tokens = null;
if(!text.isEmpty() && !text.equals(""))
tokens=text.toString().split("\\s+");
return tokens;
}
/**
* Tokenize an input filename and output tokens in .tok file. It just
* creates a file as in_filename+".tok"
*
* @param in_filename
* @return out_filename
*/
public String tokenize_filename_to_tokfile(String in_filename) throws Exception {
File in_file = new File(in_filename);
File out_file = new File(in_filename + ".tok");
String encoding = FileUtils.getEncoding(in_file);
if (!encoding.equalsIgnoreCase("UTF-8") && !encoding.equalsIgnoreCase("ASCII")) {
throw new Exception("\n\tError: Only ASCII/UTF-8 text is allowed. " + in_file.getName() + " is " + encoding + "\n");
}
//try {
BufferedWriter out;
if (out_file != null) {
out = new BufferedWriter(new FileWriter(out_file));
} else {
out = new BufferedWriter(new OutputStreamWriter(System.out));
}
// read-line-by-line and write line by line
BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(in_file), "UTF-8"));
try {
String line;
String previous_line = null; // avoid extra new line in the last line ended by sent-separator
while ((line = in.readLine()) != null) {
if (previous_line != null) {
out.write(tokenize(previous_line));
}
previous_line = line;
}
String last_line = tokenize(previous_line);
if (doSentSplit && last_line.endsWith("\n\n")) {
last_line = last_line.substring(0, last_line.length() - 1);
}
out.write(last_line);
} finally {
if (out_file == null) {
out.flush();
} else {
out.close();
}
in.close();
}
return in_filename + ".tok";
/*} catch (IOException ex) {
Logger.getLogger(Tokenizer_PTB_Rulebased.class.getName()).log(Level.SEVERE, null, ex);
}*/
}
}