/*******************************************************************************
 * Copyright 2016
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
package de.tudarmstadt.ukp.alignment.framework.uima;

//import static org.uimafit.factory.AnalysisEngineFactory.createPrimitiveDescription;
//import static org.uimafit.factory.CollectionReaderFactory.createCollectionReader;
import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription;
import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription;
import static org.apache.uima.fit.pipeline.SimplePipeline.runPipeline;

import java.io.IOException;
import java.util.ArrayList;
import java.util.BitSet;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Vector;

import org.apache.uima.UIMAException;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.text.AnnotationFS;
import org.apache.uima.collection.CollectionReaderDescription;
import org.apache.uima.resource.ResourceInitializationException;

import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import de.tudarmstadt.ukp.dkpro.core.languagetool.LanguageToolLemmatizer;
import de.tudarmstadt.ukp.dkpro.core.opennlp.OpenNlpPosTagger;
import de.tudarmstadt.ukp.dkpro.core.opennlp.OpenNlpSegmenter;
import de.tudarmstadt.ukp.dkpro.core.stopwordremover.StopWordRemover;

/**
 * Static helpers for simple vector arithmetic, POS tag mapping and for running
 * uimaFIT pipelines that segment, stop-word-filter, lemmatize and POS-tag a text.
 */
public class Toolkit {

    /**
     * Maps tagger-specific POS tags to coarse category names. The map starts out empty and is
     * filled by {@link #initializePOS()} and/or {@link #initializePOSGerman()}; both write into
     * this same map without clearing it first.
     */
    public static HashMap<String, String> posMapping = new HashMap<String, String>();

    /** The platform line separator. */
    public static final String LF = System.getProperty("line.separator");

    /**
     * Scales a vector to Euclidean length 1.
     * <p>
     * The array is modified <em>in place</em> and the same instance is returned. A vector of
     * length zero (all components 0, or an empty array) is returned unchanged instead of being
     * turned into NaN values by a division by zero.
     *
     * @param vector
     *            the vector to normalize; must not be {@code null}
     * @return the (modified) input array
     */
    public static double[] normalizeVector(double[] vector) {
        double squaredSum = 0;
        for (double component : vector) {
            squaredSum += Math.pow(component, 2);
        }
        double length = Math.sqrt(squaredSum);
        if (length == 0.0) {
            // Nothing to scale; dividing would only produce NaN.
            return vector;
        }
        for (int i = 0; i < vector.length; i++) {
            vector[i] = vector[i] / length;
        }
        return vector;
    }

    /**
     * Equivalent to {@link #normalizeVector(double[])}; kept so existing callers keep working.
     *
     * @param vector
     *            the vector to normalize in place; must not be {@code null}
     * @return the (modified) input array
     */
    public static double[] calu(double[] vector) {
        return normalizeVector(vector);
    }

    /**
     * Returns the centroid of the given points, exactly like
     * {@link #calculateCentroid(HashSet)}.
     * <p>
     * NOTE(review): despite the method name, {@code point} is never read; no "n-1 distance" is
     * computed here. Confirm the intended semantics before relying on this method.
     *
     * @param points
     *            the vectors to average; must contain at least one vector
     * @param point
     *            currently unused
     * @return a new array holding the component-wise mean of {@code points}
     * @throws IllegalArgumentException
     *             if {@code points} is {@code null} or empty
     */
    public static double[] calculateNminus1distance(HashSet<double[]> points, double point) {
        return calculateCentroid(points);
    }

    /**
     * Computes the component-wise mean of a set of vectors.
     * <p>
     * The dimensionality is taken from the first vector the set iterates over. A later vector
     * with more components causes an {@link ArrayIndexOutOfBoundsException}; a shorter one only
     * contributes to its leading components.
     *
     * @param points
     *            the vectors to average; must contain at least one vector
     * @return a new array holding the centroid (input vectors are not modified)
     * @throws IllegalArgumentException
     *             if {@code points} is {@code null} or empty
     */
    public static double[] calculateCentroid(HashSet<double[]> points) {
        if (points == null || points.isEmpty()) {
            throw new IllegalArgumentException("points must contain at least one vector");
        }
        double[] centroid = null;
        for (double[] element : points) {
            if (centroid == null) {
                // Copy the first vector so the caller's data is left untouched.
                centroid = element.clone();
            } else {
                for (int i = 0; i < element.length; i++) {
                    centroid[i] += element[i];
                }
            }
        }
        double count = points.size();
        for (int i = 0; i < centroid.length; i++) {
            centroid[i] = centroid[i] / count;
        }
        return centroid;
    }

    /**
     * Computes the cosine of the angle between two vectors.
     * <p>
     * Only the first {@code vector2.length} components are considered, so {@code vector1} must
     * be at least as long as {@code vector2}. If either vector has zero length the similarity
     * is defined as {@code 0.0} (mirroring the zero guard in
     * {@link #bitSimilarity(BitSet, BitSet, boolean)}) instead of evaluating {@code 0/0}.
     *
     * @param vector1
     *            first vector
     * @param vector2
     *            second vector; its length determines the number of components compared
     * @return the cosine similarity, or {@code 0.0} if one of the vectors is all zeros
     */
    public static double cosineSimilarity(double[] vector1, double[] vector2) {
        double dotProduct = 0;
        double squaredNorm1 = 0;
        double squaredNorm2 = 0;
        for (int i = 0; i < vector2.length; i++) {
            dotProduct += vector1[i] * vector2[i];
            squaredNorm1 += vector1[i] * vector1[i];
            squaredNorm2 += vector2[i] * vector2[i];
        }
        double denominator = Math.sqrt(squaredNorm1) * Math.sqrt(squaredNorm2);
        if (denominator == 0.0) {
            return 0.0;
        }
        return dotProduct / denominator;
    }

    /**
     * Counts the bits set in both bit sets, optionally normalized.
     *
     * @param vector1
     *            first bit set (not modified)
     * @param vector2
     *            second bit set (not modified)
     * @param normalize
     *            if {@code true}, the overlap is divided by the larger of the two cardinalities
     * @return the number of shared bits, or that number divided by the larger cardinality;
     *         {@code 0.0} when normalizing two empty bit sets
     */
    public static double bitSimilarity(BitSet vector1, BitSet vector2, boolean normalize) {
        BitSet shared = (BitSet) vector1.clone();
        shared.and(vector2);
        int overlap = shared.cardinality();
        if (!normalize) {
            return overlap;
        }
        double max = Math.max(vector1.cardinality(), vector2.cardinality());
        if (max == 0.0) {
            return max;
        }
        return overlap / max;
    }

    /**
     * Runs the given analysis engines over {@code input}, followed by the project's
     * {@code StringWriter} consumer, and returns what that consumer collected. The document
     * language is fixed to {@code "de"}.
     * <p>
     * The getter and the result are exchanged through static fields of {@code StringWriter}.
     * NOTE(review): this presumably makes concurrent calls unsafe; confirm against
     * {@code StringWriter}.
     *
     * @param input
     *            the text to process
     * @param getter
     *            extracts the result data from the CAS
     * @param aeds
     *            the analysis engines to run, in order
     * @return the collected strings, or {@code null} if the pipeline failed (the stack trace is
     *         printed)
     */
    public static String[] process(String input, PosGetter getter,
            AnalysisEngineDescription... aeds) {
        try {
            return runAndCollect(input, "de", getter, aeds);
        } catch (UIMAException e) {
            // Also covers ResourceInitializationException, a subclass of UIMAException.
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
        return null;
    }

    /**
     * Builds a reader for {@code input} in the given language, appends the {@code StringWriter}
     * consumer to {@code aeds}, runs the pipeline and returns the consumer's content.
     */
    private static String[] runAndCollect(String input, String language, PosGetter getter,
            AnalysisEngineDescription... aeds) throws UIMAException, IOException {
        CollectionReaderDescription reader = createReaderDescription(StringReader.class,
                StringReader.PARAM_CONTENT, input,
                StringReader.PARAM_LANGUAGE, language);
        AnalysisEngineDescription writer = createEngineDescription(StringWriter.class);

        AnalysisEngineDescription[] pipeline = new AnalysisEngineDescription[aeds.length + 1];
        System.arraycopy(aeds, 0, pipeline, 0, aeds.length);
        pipeline[aeds.length] = writer;

        StringWriter.getter = getter;
        runPipeline(reader, pipeline);
        return (String[]) StringWriter.mContent;
    }

    /**
     * Looks up the coarse category for a POS tag in {@link #posMapping}.
     *
     * @param inputPos
     *            the tagger-specific POS tag
     * @return the mapped category, or {@code "unknown"} if the tag has no mapping
     */
    public static String posMatcher(String inputPos) {
        String mapped = posMapping.get(inputPos);
        return mapped == null ? "unknown" : mapped;
    }

    /** Registers every tag in {@code tags} under {@code category} in {@link #posMapping}. */
    private static void mapTags(String category, String... tags) {
        for (String tag : tags) {
            posMapping.put(tag, category);
        }
    }

    /**
     * Adds the English tag mappings to {@link #posMapping}. Existing entries are kept; entries
     * with the same tag are overwritten.
     */
    public static void initializePOS() {
        mapTags("coordinatingConjunction", "CC");
        mapTags("numeral", "CD");
        mapTags("determiner", "DT");
        mapTags("conjunction", "IN");
        mapTags("adjective", "JJ", "JJR", "JJS");
        mapTags("noun", "NN", "NNS", "NP", "NPS");
        mapTags("adverb", "PDT");
        mapTags("personalPronoun", "PP");
        mapTags("possessivePronoun", "PP$");
        // The original registered "RBR" twice; the superlative adverb tag "RBS" was missing.
        mapTags("adverb", "RB", "RBR", "RBS");
        mapTags("particle", "RP");
        mapTags("interjection", "UH");
        mapTags("verb", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ",
                "VV", "VVD", "VVG", "VVN", "VVP", "VVZ");
        mapTags("relativePronoun", "WDT");
        mapTags("interrogativePronoun", "WP", "WP$", "WRB");
    }

    /**
     * Adds the German tag mappings to {@link #posMapping}. Existing entries are kept; entries
     * with the same tag are overwritten.
     */
    public static void initializePOSGerman() {
        mapTags("coordinatingConjunction", "CC");
        mapTags("numeral", "CARD");
        mapTags("determiner", "DT");
        mapTags("conjunction", "IN");
        mapTags("adjective", "ADJA", "ADJD", "JJS");
        mapTags("noun", "NN", "NE", "NP", "NPS");
        mapTags("adverb", "ADV");
        mapTags("personalPronoun", "PP");
        mapTags("possessivePronoun", "PP$");
        mapTags("adverb", "RB", "RBR");
        mapTags("particle", "RP");
        mapTags("interjection", "UH");
        mapTags("verb", "VVFIN", "VVIMP", "VVINF", "VVIZU", "VVPP",
                "VAFIN", "VAIMP", "VAINF", "VAPP",
                "VMFIN", "VMINF", "VMPP");
        mapTags("relativePronoun", "WDT");
        mapTags("interrogativePronoun", "WP", "WP$", "WRB");
    }

    /**
     * Segments, stop-word-filters, lemmatizes and POS-tags a German text.
     *
     * @param text
     *            the text to process
     * @return one {@code lemma#category} string per retained token (see
     *         {@link PosGetter#retrieveData(CAS)}), or {@code null} if {@code text} is
     *         {@code null}/empty or the pipeline could not be set up or run
     * @throws ResourceInitializationException
     *             declared for compatibility; the visible code catches it, prints the stack
     *             trace and returns {@code null}
     */
    public static String[] lemmatizeGerman(String text) throws ResourceInitializationException {
        if (text == null || text.isEmpty()) {
            return null;
        }
        try {
            AnalysisEngineDescription seg = createEngineDescription(OpenNlpSegmenter.class);
            AnalysisEngineDescription sw = createEngineDescription(StopWordRemover.class,
                    StopWordRemover.PARAM_MODEL_LOCATION,
                    new String[] { "src/main/resources/snowball_german_stopwords.txt" });
            AnalysisEngineDescription pos = createEngineDescription(OpenNlpPosTagger.class,
                    OpenNlpPosTagger.PARAM_LANGUAGE, "de");
            AnalysisEngineDescription lem = createEngineDescription(LanguageToolLemmatizer.class);

            return process(text, new PosGetter(), seg, sw, lem, pos);
        } catch (ResourceInitializationException e) {
            e.printStackTrace();
        }
        return null;
    }

    /**
     * Segments, stop-word-filters, lemmatizes and POS-tags an English text.
     *
     * @param text
     *            the text to process
     * @return one {@code lemma#category} string per retained token (see
     *         {@link PosGetter#retrieveData(CAS)}), or {@code null} if {@code text} is
     *         {@code null}/empty or anything goes wrong while building or running the pipeline
     * @throws ResourceInitializationException
     *             declared for compatibility; the visible code catches every exception, prints
     *             the stack trace and returns {@code null}
     */
    public static String[] lemmatizeEnglish(String text) throws ResourceInitializationException {
        if (text == null || text.isEmpty()) {
            return null;
        }
        try {
            AnalysisEngineDescription seg = createEngineDescription(OpenNlpSegmenter.class);
            AnalysisEngineDescription lemma = createEngineDescription(LanguageToolLemmatizer.class);
            AnalysisEngineDescription pos = createEngineDescription(OpenNlpPosTagger.class);

            HashSet<String> swords = new HashSet<String>();
            swords.add("src/main/resources/stopwords_english_punctuation.txt");
            AnalysisEngineDescription sw = createEngineDescription(StopWordRemover.class,
                    StopWordRemover.PARAM_MODEL_LOCATION, swords);

            return runAndCollect(text, "en", new PosGetter(), seg, sw, lemma, pos);
        } catch (Exception e) {
            // Deliberately broad: any failure (including runtime errors while reading the CAS)
            // results in a null return rather than an exception for the caller.
            e.printStackTrace();
        }
        return null;
    }

    /**
     * Extracts {@code lemma#category} strings for the tokens of a CAS.
     */
    public static class PosGetter {

        /**
         * Collects one {@code lemma#category} entry per {@link Token} annotation in the CAS,
         * where the category is obtained via {@link Toolkit#posMatcher(String)}.
         * <p>
         * A token is skipped if its covered text is empty after applying the cleanup pattern,
         * or if it carries no POS or no lemma annotation.
         *
         * @param cas
         *            the CAS to read tokens from
         * @return a {@code String[]} with the collected entries
         */
        public Object retrieveData(CAS cas) {
            List<String> entries = new ArrayList<String>();
            String tokenTypeName = Token.class.getName();
            for (AnnotationFS annot : cas.getAnnotationIndex()) {
                // Compare the type name exactly instead of via a substring match.
                if (!tokenTypeName.equals(annot.getType().getName())) {
                    continue;
                }
                Token token = (Token) annot;
                // NOTE(review): the pattern only removes a punctuation mark that directly
                // follows an '@'; confirm that this is the intended cleanup.
                String text = token.getCoveredText().replaceAll("@[.:;,\"'´`]", "");
                if (text.length() == 0 || token.getPos() == null || token.getLemma() == null) {
                    continue;
                }
                // TODO: implement the POS mapping properly.
                entries.add(token.getLemma().getValue() + "#"
                        + posMatcher(token.getPos().getPosValue()));
            }
            return entries.toArray(new String[entries.size()]);
        }
    }

    /**
     * Joins each pair of neighbouring entries with an underscore.
     *
     * @param string1
     *            the input tokens
     * @return an array of {@code string1.length - 1} bigrams of the form {@code a_b}; an empty
     *         array if {@code string1} is {@code null} or has fewer than two entries
     */
    public static String[] createMultiwords(String[] string1) {
        if (string1 == null || string1.length < 2) {
            return new String[0];
        }
        String[] result = new String[string1.length - 1];
        for (int i = 0; i < result.length; i++) {
            result[i] = string1[i] + "_" + string1[i + 1];
        }
        return result;
    }
}