package org.bbaw.wsp.cms.dochandler;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;

import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;
import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.Lemma;
import de.mpg.mpiwg.berlin.mpdl.lt.morph.app.MorphologyCache;
import de.mpg.mpiwg.berlin.mpdl.lt.text.norm.Normalizer;
import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.Token;
import de.mpg.mpiwg.berlin.mpdl.lt.text.tokenize.Tokenizer;

/**
 * Lazily created singleton that tokenizes strings with {@link Tokenizer} and
 * renders token lists back into space separated strings, optionally replacing
 * each token by the lemma names found in the {@link MorphologyCache}.
 *
 * <p>NOTE(review): the lazy initialisation in {@link #getInstance()} is not
 * synchronized; confirm whether callers may race on first use.
 */
public class DocumentTokenizer {
  private static DocumentTokenizer instance;
  private MorphologyCache morphCache;

  /**
   * Returns the shared instance, creating and initialising it on first use.
   *
   * @return the shared, fully initialised instance
   * @throws ApplicationException if initialisation fails; in that case no
   *         instance is retained and the next call retries
   */
  public static DocumentTokenizer getInstance() throws ApplicationException {
    if (instance == null) {
      // Publish the singleton only after init() succeeded, so a failed
      // initialisation never leaves a half-built instance behind.
      DocumentTokenizer candidate = new DocumentTokenizer();
      candidate.init();
      instance = candidate;
    }
    return instance;
  }

  /** Fetches the morphology cache used for the lemma lookups in {@link #buildStr}. */
  private void init() throws ApplicationException {
    morphCache = MorphologyCache.getInstance();
  }

  /**
   * Tokenizes the given string.
   *
   * @param inputString text to tokenize; {@code null} yields {@code null}
   * @param language language handed to the tokenizer; {@code null} yields {@code null}
   * @param normFunctions normalization functions handed to the tokenizer unchanged
   * @return the tokens delivered by the tokenizer, or {@code null} if
   *         {@code inputString} or {@code language} is {@code null}
   * @throws ApplicationException if tokenizing fails; an {@link IOException}
   *         is wrapped as the cause
   */
  public ArrayList<Token> getToken(String inputString, String language, String[] normFunctions) throws ApplicationException {
    if (inputString == null || language == null) {
      return null;
    }
    ArrayList<Token> retTokens = null;
    try {
      StringReader reader = new StringReader(inputString);
      Tokenizer tokenizer = new Tokenizer(reader);
      try {
        tokenizer.setLanguage(language);
        tokenizer.setNormFunctions(normFunctions);
        retTokens = tokenizer.getTokens();
        tokenizer.end();
      } finally {
        // Close the tokenizer on the failure path as well, not only after success.
        tokenizer.close();
      }
    } catch (IOException e) {
      throw new ApplicationException(e);
    }
    return retTokens;
  }

  /**
   * Joins the tokens into one string; every contributed token is trimmed and
   * followed by a single space (so a non-empty result ends with a space).
   *
   * @param tokens tokens to render; {@code null} yields {@code null}
   * @param language language used for the lemma lookup when {@code type} is {@code "morph"}
   * @param type {@code "orig"} for the original token content, {@code "norm"}
   *        for the normalized content, {@code "morph"} for the lemma names of
   *        the normalized content; any other value (including {@code null})
   *        contributes no text
   * @return the joined string, or {@code null} if nothing was contributed
   * @throws ApplicationException if the lemma lookup fails
   */
  public String buildStr(ArrayList<Token> tokens, String language, String type) throws ApplicationException {
    if (tokens == null || tokens.isEmpty()) {
      return null;
    }
    StringBuilder strBuilder = new StringBuilder();
    for (Token token : tokens) {
      String tokenStr = null;
      // Constant-first comparison: a null type selects no branch instead of
      // raising a NullPointerException.
      if ("orig".equals(type)) {
        tokenStr = token.getContentOrig();
      } else if ("norm".equals(type)) {
        tokenStr = token.getContentNorm();
      } else if ("morph".equals(type)) {
        tokenStr = lemmaNamesOrForm(language, token.getContentNorm());
      }
      if (tokenStr != null) {
        strBuilder.append(tokenStr.trim()).append(' ');
      }
    }
    String result = strBuilder.toString();
    return result.isEmpty() ? null : result;
  }

  /**
   * Returns the space separated lemma names registered for the given form, or
   * the form itself when the cache has no lemma for it.
   */
  private String lemmaNamesOrForm(String language, String tokenNorm) throws ApplicationException {
    ArrayList<Lemma> lemmas = morphCache.getLemmasByFormName(language, tokenNorm, Normalizer.NONE);
    if (lemmas == null || lemmas.isEmpty()) {
      return tokenNorm;
    }
    StringBuilder lemmaNames = new StringBuilder();
    for (Lemma lemma : lemmas) {
      String lemmaName = lemma.getLemmaName();
      lemmaNames.append(lemmaName).append(' ');
    }
    return lemmaNames.toString();
  }
}