package edu.isistan.uima.unified.casconsumers; import java.io.FileNotFoundException; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import net.didion.jwnl.JWNLException; import org.apache.uima.UimaContext; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.cas.FSIterator; import org.apache.uima.cas.text.AnnotationIndex; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.cas.FSArray; import org.apache.uima.jcas.tcas.Annotation; import org.apache.uima.resource.ResourceInitializationException; import org.eclipse.core.runtime.IProgressMonitor; import org.eclipse.core.runtime.SubProgressMonitor; import org.uimafit.component.JCasConsumer_ImplBase; import org.uimafit.descriptor.ConfigurationParameter; import org.uimafit.descriptor.ExternalResource; import edu.isistan.uima.unified.algorithms.clustering.CMCMClusterer; import edu.isistan.uima.unified.algorithms.clustering.data.Cluster; import edu.isistan.uima.unified.algorithms.clustering.data.CompositeCluster; import edu.isistan.uima.unified.algorithms.clustering.data.DataPoint; import edu.isistan.uima.unified.algorithms.clustering.data.LabeledDataPoint; import edu.isistan.uima.unified.algorithms.clustering.distance.DistanceMeasure; import edu.isistan.uima.unified.algorithms.clustering.distance.SemanticDistanceMeasure; import edu.isistan.uima.unified.algorithms.clustering.linkage.AverageDistance; import edu.isistan.uima.unified.algorithms.clustering.linkage.FurthestNeighbour; import edu.isistan.uima.unified.algorithms.clustering.linkage.LinkageMethod; import edu.isistan.uima.unified.algorithms.clustering.linkage.NearestNeighbour; import edu.isistan.uima.unified.algorithms.similarity.SimilarityMeasure; import edu.isistan.uima.unified.analysisengines.wordnet.JWNLInitialization; import edu.isistan.uima.unified.sharedresources.ClustersResource; import edu.isistan.uima.unified.sharedresources.ProgressMonitorResource; import edu.isistan.uima.unified.typesystems.nlp.Token; import edu.isistan.uima.unified.typesystems.srl.Role; import edu.isistan.uima.unified.typesystems.srl.Structure; import edu.isistan.uima.unified.typesystems.wordnet.Sense; import edu.stanford.nlp.util.ArrayMap; public class ClustererCasConsumer extends JCasConsumer_ImplBase { @ConfigurationParameter(name="jwnl") private String jwnlName; @ConfigurationParameter(name="wordnet") private String wordnetName; // @ConfigurationParameter(name="linkageType") private String linkageType; private LinkageMethod linkage; // @ConfigurationParameter(name="distanceType") private String distanceType; private DistanceMeasure distance; @ConfigurationParameter(name="minimumDistance") private float minimumDistance; // @ExternalResource(key="monitor") private ProgressMonitorResource monitorResource; private IProgressMonitor subMonitor; @ExternalResource(key="clusters") private ClustersResource clustersResource; @Override public void initialize(UimaContext aContext) throws ResourceInitializationException { super.initialize(aContext); try { //jwnlName = (String) aContext.getConfigParameterValue("jwnl"); //wordnetName = (String) aContext.getConfigParameterValue("wordnet"); if(!JWNLInitialization.isInit()) JWNLInitialization.init(jwnlName, wordnetName); // linkage = null; if(linkageType.equals("Nearest")) { linkage = new NearestNeighbour(); } if(linkageType.equals("Furthest")) { linkage = new FurthestNeighbour(); } if(linkageType.equals("Average")) { linkage = new AverageDistance(); } // Map<String, String> params = new HashMap<String, String>(); params.put("cache", "50000"); if(distanceType.equals("Rago")) params.put("simType", "edu.isistan.uima.unified.algorithms.similarity.Rago"); if(distanceType.equals("Lesk")) params.put("simType", "edu.isistan.uima.unified.algorithms.similarity.Lesk"); if(distanceType.equals("Lin")) params.put("simType", "edu.isistan.uima.unified.algorithms.similarity.Lin"); if(distanceType.equals("JCn")) params.put("simType", "edu.isistan.uima.unified.algorithms.similarity.JCn"); if(distanceType.equals("Lin") || distanceType.equals("JCn")) { params.put("infocontent", "file:" + System.getenv("MODELS_PATH") + "similarity/ic-bnc-resnik-add1.dat"); params.put("mapping", "file:" + System.getenv("MODELS_PATH") + "similarity/domain_independent.txt"); } SimilarityMeasure measure = SimilarityMeasure.newInstance(params); distance = new SemanticDistanceMeasure(measure); // //minimumDistance = 3; } catch (JWNLException e) { e.printStackTrace(); } catch (FileNotFoundException e) { e.printStackTrace(); } catch (Exception e) { e.printStackTrace(); } } @Override public void process(JCas aJCas) throws AnalysisEngineProcessException { // subMonitor = new SubProgressMonitor(monitorResource.getMonitor(), 1, SubProgressMonitor.PREPEND_MAIN_LABEL_TO_SUBTASK); subMonitor.subTask("Clustering semantically related concerns"); // AnnotationIndex<Annotation> stAnnotations = aJCas.getAnnotationIndex(Structure.type); AnnotationIndex<Annotation> rAnnotations = aJCas.getAnnotationIndex(Role.type); AnnotationIndex<Annotation> seAnnotations = aJCas.getAnnotationIndex(Sense.type); Map<Cluster, Structure> mapClusters; List<Cluster> initClusters; List<Cluster> clusters; // subMonitor.beginTask(this.getClass().getSimpleName(), 1); // mapClusters = generateClusters(stAnnotations, rAnnotations, seAnnotations, aJCas); initClusters = new ArrayList<Cluster>(mapClusters.keySet()); clusters = CMCMClusterer.recluster(initClusters, linkage, distance, minimumDistance); // for (Cluster cluster : clusters) { List<Structure> structureCluster = new ArrayList<Structure>(); List<Cluster> plain = plainCluster(cluster); for (Cluster c : plain) { if(mapClusters.containsKey(c)) { Structure structure = mapClusters.get(c); structureCluster.add(structure); } } ArrayList<String> externalCluster = new ArrayList<String>(); for(Structure structure : structureCluster) { externalCluster.add(structure.getIdentification()); } clustersResource.getClusters().add(externalCluster); } // subMonitor.worked(1); subMonitor.done(); } @Override public void destroy() { super.destroy(); } private List<Cluster> plainCluster(Cluster cluster) { List<Cluster> plain = new ArrayList<Cluster>(); plain.add(cluster); for(Cluster c : cluster) { plain.addAll(plainCluster(c)); } return plain; } private Map<Cluster, Structure> generateClusters(AnnotationIndex<Annotation> stAnnotations, AnnotationIndex<Annotation> rAnnotations, AnnotationIndex<Annotation> seAnnotations, JCas aJCas) { Map<Cluster, Structure> clusters = new ArrayMap<Cluster, Structure>(); FSIterator<Annotation> structures = stAnnotations.iterator(); while(structures.hasNext()) { Annotation stAnnotation = structures.next(); Structure structure = (Structure) stAnnotation; Cluster cluster = generateCluster(structure, rAnnotations, seAnnotations, aJCas); if(cluster != null) { clusters.put(cluster, structure); } } return clusters; } private Cluster generateCluster(Structure structure, AnnotationIndex<Annotation> rAnnotations, AnnotationIndex<Annotation> seAnnotations, JCas aJCas) { Cluster cluster = new CompositeCluster(); FSIterator<Annotation> roleIterator = rAnnotations.subiterator(structure); List<DataPoint> dataPoints = new ArrayList<DataPoint>(); while(roleIterator.hasNext()) { Annotation rAnnotation = roleIterator.next(); Role role = (Role) rAnnotation; if( role.getKind().equals("VERB") // || role.getKind().equals("SUBJECT") // || role.getKind().equals("DIRECTOBJECT") // || role.getKind().equals("INDIRECTOBJECT") ) dataPoints.addAll(generateDataPoints(role, seAnnotations, aJCas)); } for(DataPoint dataPoint : dataPoints) cluster.add(dataPoint); return cluster; } private List<DataPoint> generateDataPoints(Role role, AnnotationIndex<Annotation> seAnnotations, JCas aJCas) { List<DataPoint> dataPoints = new ArrayList<DataPoint>(); FSArray ocurrencies = role.getOcurrencies(); for(int i = 0; i < ocurrencies.size(); i++) { FSArray ocurrency = (FSArray) ocurrencies.get(i); for(int j = 0; j < ocurrency.size(); j++) { DataPoint dataPoint = null; Token token = (Token) ocurrency.get(j); FSIterator<Annotation> senseIterator = seAnnotations.subiterator(token); if(senseIterator.hasNext()) { Annotation seAnnotation = senseIterator.next(); Sense sense = (Sense) seAnnotation; if(sense.getSense() != null || sense.getSenses() != null) { if(sense.getSense() != null) dataPoint = new LabeledDataPoint(Double.parseDouble(sense.getSense()), token.getPos()); else if(sense.getSenses() != null) dataPoint = new LabeledDataPoint(Double.parseDouble(sense.getSenses().get(0)), token.getPos()); } } if(dataPoint != null) dataPoints.add(dataPoint); } } return dataPoints; } }