/** * This is a Java port of the TreeTagger's cmd/utf8-tokenize.perl script * after it was altered in a way that made it unusable for invokation. */ package de.unihd.dbs.uima.annotator.treetagger; import java.io.BufferedReader; import java.io.File; import java.io.FileReader; import java.util.ArrayList; import java.util.EnumSet; import java.util.LinkedList; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; import de.unihd.dbs.uima.annotator.heideltime.utilities.Logger; /** * * @author Helmut Schmid, IMS, University of Stuttgart * Serge Sharoff, University of Leeds * Julian Zell, University of Heidelberg * */ public class TreeTaggerTokenizer { public static enum Flag { ENGLISH, FRENCH, ITALIAN, GALICIAN, Z; public static EnumSet<Flag> getSet(String flagName) { EnumSet<Flag> set = EnumSet.noneOf(Flag.class); if(flagName == null) return set; if(flagName.contains("-e")) set.add(ENGLISH); if(flagName.contains("-f")) set.add(FRENCH); if(flagName.contains("-i")) set.add(ITALIAN); if(flagName.contains("-g")) set.add(GALICIAN); if(flagName.contains("-z")) set.add(Z); return set; } } EnumSet<Flag> flags = null; private File abbreviationsFile = null; private String PChar = "\\[¿¡\\{\\(\\`\"‚„†‡‹‘’“”•–—›'"; private String FChar = "\\]\\}\\'\\`\"\\),;:\\!\\?\\%‚„…†‡‰‹‘’“”•–—›"; private String FClitic = ""; private String PClitic = ""; private ArrayList<String> abbreviations = new ArrayList<String>(); public TreeTaggerTokenizer(String abbreviationsFile, EnumSet<Flag> flags) throws RuntimeException { this.flags = flags; if(abbreviationsFile != null) { this.abbreviationsFile = new File(abbreviationsFile); if(!this.abbreviationsFile.exists() || !this.abbreviationsFile.canRead()) { Logger.printError(this.getClass(), "Couldn't read abbreviations file " + abbreviationsFile + " (exist:" + this.abbreviationsFile.exists() + ",read:" + this.abbreviationsFile.canRead() + ")"); throw new RuntimeException(); } BufferedReader br = null; try { br = new BufferedReader(new FileReader(this.abbreviationsFile)); String line = null; while((line = br.readLine()) != null) { line = line.replaceAll("^[ \t\r\n]+", ""); line = line.replaceAll("[ \t\r\n]+$", ""); if(!line.matches("^(#.*|\\s$)")) { abbreviations.add(line); } } } catch(Exception e) { e.printStackTrace(); } finally { if(br != null) { try { br.close(); } catch(Exception e) { e.printStackTrace(); } } } } if(flags.contains(Flag.ENGLISH)) { FClitic = "'(s|re|ve|d|m|em|ll)|n't"; } if(flags.contains(Flag.ITALIAN)) { PClitic = "[dD][ae]ll'|[nN]ell'|[Aa]ll'|[lLDd]'|[Ss]ull'|[Qq]uest'|[Uu]n'|[Ss]enz'|[Tt]utt'"; } if(flags.contains(Flag.FRENCH)) { PClitic = "[dcjlmnstDCJLNMST]'|[Qq]u'|[Jj]usqu'|[Ll]orsqu'"; FClitic = "-t-elles?|-t-ils?|-t-on|-ce|-elles?|-ils?|-je|-la|-les?|-leur|-lui|-mmes?|-m'|-moi|-nous|-on|-toi|-tu|-t'|-vous|-en|-y|-ci|-l"; } if(flags.contains(Flag.GALICIAN)) { FClitic = "-la|-las|-lo|-los|-nos"; } } public List<String> tokenize(String in) { StringBuilder outBuf = new StringBuilder(); for(String text : in.split("\n")) { // replace newlines and tab characters with blanks text = text.replaceAll("[\r\n\t]", " "); // replace blanks within SGML tags text = text.replaceAll("(<[^<> ]*) ([^<>]*>)", "$1\377$2"); // replace whitespace with a special character text = text.replaceAll("[\\u2000-\\u200A \\u202F\\u205F\\u3000\\u00A0\\u1680\\u180E]", "\376"); // restore SGML tags text = text.replaceAll("\377", " "); text = text.replaceAll("\376", "\377"); // prepare SGML-Tags for tokenization text = text.replaceAll("(<[^<>]*>)", "\377$1\377"); text = text.replaceAll("^\377", ""); text = text.replaceAll("\377$", ""); text = text.replaceAll("\377\377\377*", "\377"); String[] texts = text.split("\377"); for(String line : texts) { if(line.matches("^<.*>$")) { // SGML tag outBuf.append(line + "\n"); } else { // add a blank at the beginning and the end of each segment line = " " + line + " "; // insert missing blanks after punctuation line = line.replaceAll("\\.\\.\\.", " ... "); line = line.replaceAll("([;\\!\\?])([^ ])", "$1 $2"); line = line.replaceAll("([.,:])([^ 0-9.])", "$1 $2"); String[] lines = line.split(" "); for(String token : lines) { // remove some whitespaces that \s doesn't catch if(token.equals("")) continue; String suffix = ""; // separate punctuation and parentheses from words Boolean finished = false; Matcher m; do { finished = true; // cut off preceding punctuation m = Pattern.compile("^([" + PChar + "])(.)").matcher(token); if(m.find()) { token = token.replaceAll("^([" + PChar + "])(.)", "$2"); outBuf.append(m.group(1) + "\n"); finished = false; } // cut off trailing punctuation m = Pattern.compile("(.)([" + FChar + "])$").matcher(token); if(m.find()) { token = token.replaceAll("(.)([" + FChar + "])$", "$1"); suffix = m.group(2) + "\n" + suffix; finished = false; } // cut off trailing periods if punctuation precedes m = Pattern.compile("([" + FChar + "])\\.$").matcher(token); if(m.find()) { token = token.replaceAll("([" + FChar + "])\\.$", ""); suffix = ".\n" + suffix; if(token.equals("")) { token = m.group(1); } else { suffix = m.group(1) + "\n" + suffix; } finished = false; } } while(!finished); // handle explicitly listed tokens if(abbreviations.contains(token)) { outBuf.append(token + "\n" + suffix); continue; } // abbreviations of the form A. or U.S.A. if(token.matches("^([A-Za-z-]\\.)+$")) { outBuf.append(token + "\n" + suffix); continue; } // disambiguate periods m = Pattern.compile("^(..*)\\.$").matcher(token); if(m.matches() && !line.equals("...") && !(flags.contains(Flag.GALICIAN) && token.matches("^[0-9]+\\.$"))) { token = m.group(1); suffix = ".\n" + suffix; if(abbreviations.contains(token)) { outBuf.append(token + "\n" + suffix); continue; } } // cut off clitics while(true) { m = Pattern.compile("^(--)(.)").matcher(token); if(!m.find()) { break; } token = token.replaceAll("^(--)(.)", "$2"); outBuf.append(m.group(1) + "\n"); } if(!PClitic.equals("")) { while(true) { m = Pattern.compile("^(" + PClitic + ")(.)").matcher(token); if(!m.find()) { break; } token = token.replaceAll("^(" + PClitic + ")(.)", "$2"); outBuf.append(m.group(1) + "\n"); } } while(true) { m = Pattern.compile("(.)(--)$").matcher(token); if(!m.find()) { break; } token = token.replaceAll("(.)(--)$", "$1"); suffix = m.group(2) + "\n" + suffix; } if(!FClitic.equals("")) { while(true) { m = Pattern.compile("(.)(" + FClitic + ")$").matcher(token); if(!m.find()) { break; } token = token.replaceAll("(.)(" + FClitic + ")$", "$1"); suffix = m.group(2) + "\n" + suffix; } } outBuf.append(token + "\n" + suffix); } } } } LinkedList<String> outList = new LinkedList<String>(); for(String s : outBuf.toString().split("\n")) { s = s.replaceAll("^[\\p{javaWhitespace}\\p{gc=Cc}]+", ""); s = s.replaceAll("[\\p{javaWhitespace}\\p{gc=Cc}]+$", ""); outList.add(s); } return outList; } }