/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.mahout.utils.clustering;

import java.io.IOException;
import java.io.Writer;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;

import org.apache.mahout.clustering.AbstractCluster;
import org.apache.mahout.clustering.Cluster;
import org.apache.mahout.clustering.classify.WeightedPropertyVectorWritable;
import org.apache.mahout.clustering.iterator.ClusterWritable;
import org.apache.mahout.common.distance.DistanceMeasure;
import org.apache.mahout.math.NamedVector;
import org.apache.mahout.math.Vector;
import org.codehaus.jackson.map.ObjectMapper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Dump cluster info to JSON formatted lines. Heavily inspired by
 * ClusterDumperWriter.java and CSVClusterWriter.java
 *
 * <p>Each call to {@link #write(ClusterWritable)} emits one line containing a
 * JSON object with the keys {@code top_terms}, {@code cluster_id},
 * {@code cluster} and {@code points}.
 */
public class JsonClusterWriter extends AbstractClusterWriter {

  private static final Logger log = LoggerFactory.getLogger(JsonClusterWriter.class);

  /** Matches the characters that are replaced by "_" when a vector is used as a display name. */
  private static final Pattern VEC_PATTERN = Pattern.compile("\\{|\\:|\\,|\\}");

  /** Orders terms by weight, heaviest first. */
  private static final Comparator<TermIndexWeight> BY_WEIGHT_DESCENDING =
      new Comparator<TermIndexWeight>() {
        @Override
        public int compare(TermIndexWeight one, TermIndexWeight two) {
          return Double.compare(two.weight, one.weight);
        }
      };

  private final String[] dictionary;
  private final int numTopFeatures;
  private final ObjectMapper jxn;

  /**
   * @param writer            destination for the JSON lines
   * @param clusterIdToPoints points grouped by the id of the cluster they belong to
   * @param measure           distance measure handed to the parent writer
   * @param numTopFeatures    maximum number of top terms to report per cluster
   * @param dictionary        term dictionary indexed by vector position; may be {@code null},
   *                          in which case terms, cluster details and points are written empty
   */
  public JsonClusterWriter(Writer writer,
                           Map<Integer, List<WeightedPropertyVectorWritable>> clusterIdToPoints,
                           DistanceMeasure measure,
                           int numTopFeatures,
                           String[] dictionary) {
    super(writer, clusterIdToPoints, measure);
    this.numTopFeatures = numTopFeatures;
    this.dictionary = dictionary;
    jxn = new ObjectMapper();
  }

  /**
   * Generate a map with the cluster info and write it as a single JSON
   * formatted line.
   *
   * @param clusterWritable wrapper holding the cluster to dump
   * @throws IOException if serializing or writing the line fails
   */
  @Override
  public void write(ClusterWritable clusterWritable) throws IOException {
    Cluster cluster = clusterWritable.getValue();
    Map<String, Object> res = new HashMap<>();

    // get top terms
    if (dictionary != null) {
      List<Object> topTerms = getTopFeaturesList(cluster.getCenter(), dictionary, numTopFeatures);
      res.put("top_terms", topTerms);
    } else {
      res.put("top_terms", new ArrayList<>());
    }

    // get human-readable cluster representation
    res.put("cluster_id", cluster.getId());

    if (dictionary != null) {
      Map<String, Object> fmtStr = cluster.asJson(dictionary);
      res.put("cluster", fmtStr);

      // get points
      List<Object> points = getPoints(cluster, dictionary);
      res.put("points", points);
    } else {
      res.put("cluster", new HashMap<>());
      res.put("points", new ArrayList<>());
    }

    // write JSON
    Writer writer = getWriter();
    writer.write(jxn.writeValueAsString(res) + "\n");
  }

  /**
   * Create a list of single-entry maps ({@code term -> weight}) for the
   * heaviest non-zero elements of a vector.
   *
   * <p>Only the {@code numTerms} heaviest elements are considered. An element
   * whose index has no dictionary entry (the index lies outside the
   * dictionary or the entry is {@code null}) is logged and skipped, so the
   * result may contain fewer than {@code numTerms} items.
   *
   * @param vector     vector whose non-zero elements are ranked
   * @param dictionary term dictionary indexed by vector position
   * @param numTerms   maximum number of elements to consider
   * @return the top terms, in descending weight order
   */
  public List<Object> getTopFeaturesList(Vector vector, String[] dictionary, int numTerms) {
    List<TermIndexWeight> vectorTerms = new ArrayList<>();
    for (Vector.Element elt : vector.nonZeroes()) {
      vectorTerms.add(new TermIndexWeight(elt.index(), elt.get()));
    }

    // Sort results in reverse order (i.e. weight in descending order)
    Collections.sort(vectorTerms, BY_WEIGHT_DESCENDING);

    List<Object> topTerms = new ArrayList<>();
    int limit = Math.min(vectorTerms.size(), numTerms);
    for (int i = 0; i < limit; i++) {
      TermIndexWeight candidate = vectorTerms.get(i);
      int index = candidate.index;

      // An index beyond the dictionary is handled like a null entry instead
      // of aborting the whole dump with an ArrayIndexOutOfBoundsException.
      String dictTerm = (index >= 0 && index < dictionary.length) ? dictionary[index] : null;
      if (dictTerm == null) {
        log.error("Dictionary entry missing for {}", index);
        continue;
      }

      Map<String, Object> termEntry = new HashMap<>();
      termEntry.put(dictTerm, candidate.weight);
      topTerms.add(termEntry);
    }

    return topTerms;
  }

  /**
   * Create a list of maps describing the points assigned to a cluster.
   *
   * <p>Each map holds {@code vector_name}, {@code weight} (as a string) and
   * {@code point}. If formatting the point vector raises an
   * {@link IOException}, the error is logged and the entry is still returned
   * without its {@code point} key.
   *
   * @param cluster    cluster whose points are looked up by id
   * @param dictionary term dictionary passed on to the vector formatter
   * @return one entry per point; empty when no points are registered for the cluster
   */
  public List<Object> getPoints(Cluster cluster, String[] dictionary) {
    List<Object> vectorObjs = new ArrayList<>();
    List<WeightedPropertyVectorWritable> points = getClusterIdToPoints().get(cluster.getId());

    if (points == null) {
      return vectorObjs;
    }

    for (WeightedPropertyVectorWritable point : points) {
      Map<String, Object> entry = new HashMap<>();
      Vector theVec = point.getVector();

      if (theVec instanceof NamedVector) {
        entry.put("vector_name", ((NamedVector) theVec).getName());
      } else {
        String vecStr = theVec.asFormatString();
        // do some basic manipulations for display
        vecStr = VEC_PATTERN.matcher(vecStr).replaceAll("_");
        entry.put("vector_name", vecStr);
      }

      entry.put("weight", String.valueOf(point.getWeight()));

      try {
        entry.put("point", AbstractCluster.formatVectorAsJson(theVec, dictionary));
      } catch (IOException e) {
        // best effort: keep the entry, just without its "point" details
        log.error("IOException: ", e);
      }

      vectorObjs.add(entry);
    }

    return vectorObjs;
  }

  /**
   * Convenience class for sorting terms: pairs a vector index with its weight.
   */
  private static class TermIndexWeight {
    private final int index;
    private final double weight;

    TermIndexWeight(int index, double weight) {
      this.index = index;
      this.weight = weight;
    }
  }
}