/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.mahout.fpm.pfpgrowth.convertors.string;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;

import com.google.common.collect.Lists;
import org.apache.hadoop.io.Writable;
import org.apache.mahout.common.Pair;

/**
 * Holds a list of string patterns, each paired with a {@code Long} support
 * value, and knows how to merge two such lists and how to (de)serialize
 * itself as a Hadoop {@link Writable}.
 */
public final class TopKStringPatterns implements Writable {

  /** Backing list of (itemset, support) pairs, in the order they were supplied or read. */
  private final List<Pair<List<String>,Long>> frequentPatterns;

  /** Creates an instance with no patterns. */
  public TopKStringPatterns() {
    this.frequentPatterns = Lists.newArrayList();
  }

  /**
   * Creates an instance holding a copy of the given patterns, in the
   * collection's iteration order.
   *
   * @param patterns the (itemset, support) pairs to copy into this instance
   */
  public TopKStringPatterns(Collection<Pair<List<String>, Long>> patterns) {
    this.frequentPatterns = Lists.newArrayList();
    this.frequentPatterns.addAll(patterns);
  }

  /**
   * @return an iterator over the stored (itemset, support) pairs
   */
  public Iterator<Pair<List<String>,Long>> iterator() {
    return this.frequentPatterns.iterator();
  }

  /**
   * @return the backing list itself (not a copy)
   */
  public List<Pair<List<String>,Long>> getPatterns() {
    return this.frequentPatterns;
  }

  /**
   * Combines the patterns of this instance with those of {@code pattern}
   * into a new instance holding at most {@code heapSize} entries.
   *
   * <p>Both lists are walked front to back. At every step the current heads
   * are compared (support first, then itemset size, then the items position
   * by position) and the greater one is emitted. When the heads compare as
   * equal, only the head from {@code pattern} is emitted and both are
   * consumed. Neither input is modified.
   *
   * <p>NOTE(review): this only yields a globally ordered result if both
   * inputs are already sorted in descending order under that comparison;
   * presumably callers guarantee this - confirm.
   *
   * @param pattern  the other set of patterns to merge with this one
   * @param heapSize the maximum number of patterns in the result
   * @return a new {@code TopKStringPatterns} containing the merged patterns
   */
  public TopKStringPatterns merge(TopKStringPatterns pattern, int heapSize) {
    List<Pair<List<String>,Long>> merged = Lists.newArrayList();
    Iterator<Pair<List<String>,Long>> mine = this.frequentPatterns.iterator();
    Iterator<Pair<List<String>,Long>> theirs = pattern.iterator();

    // Current unconsumed head of each side; null means "fetch the next one".
    Pair<List<String>,Long> left = null;
    Pair<List<String>,Long> right = null;

    // Every pass below either appends exactly one element or breaks, so the
    // size of the output doubles as the iteration counter.
    while (merged.size() < heapSize) {
      if (left == null && mine.hasNext()) {
        left = mine.next();
      }
      if (right == null && theirs.hasNext()) {
        right = theirs.next();
      }

      if (left == null && right == null) {
        // Both sides are exhausted.
        break;
      }
      if (right == null) {
        merged.add(left);
        left = null;
        continue;
      }
      if (left == null) {
        merged.add(right);
        right = null;
        continue;
      }

      int order = comparePatterns(left, right);
      if (order > 0) {
        merged.add(left);
        left = null;
      } else {
        merged.add(right);
        if (order == 0) {
          // Identical pattern on both sides: keep one copy, consume both.
          left = null;
        }
        right = null;
      }
    }
    return new TopKStringPatterns(merged);
  }

  /**
   * Orders two patterns by support, then by itemset size, then by comparing
   * the items at each position with {@link String#compareTo(String)}.
   *
   * @return a negative, zero or positive value as {@code a} is less than,
   *         equal to or greater than {@code b}
   */
  private static int comparePatterns(Pair<List<String>,Long> a, Pair<List<String>,Long> b) {
    int result = a.getSecond().compareTo(b.getSecond());
    if (result != 0) {
      return result;
    }
    List<String> aItems = a.getFirst();
    List<String> bItems = b.getFirst();
    result = aItems.size() - bItems.size();
    if (result != 0) {
      return result;
    }
    // Sizes are equal here, so indexing both lists with the same bound is safe.
    for (int k = 0; k < aItems.size(); k++) {
      result = aItems.get(k).compareTo(bItems.get(k));
      if (result != 0) {
        return result;
      }
    }
    return result;
  }

  /**
   * Replaces the current contents with patterns read from {@code in}.
   * The expected layout is: pattern count (int), then for each pattern the
   * item count (int), the support (long) and each item via {@code readUTF}.
   *
   * @param in the input to read from
   * @throws IOException if reading from {@code in} fails
   */
  @Override
  public void readFields(DataInput in) throws IOException {
    this.frequentPatterns.clear();
    int patternCount = in.readInt();
    for (int p = 0; p < patternCount; p++) {
      List<String> itemset = Lists.newArrayList();
      int itemCount = in.readInt();
      long support = in.readLong();
      for (int k = 0; k < itemCount; k++) {
        itemset.add(in.readUTF());
      }
      this.frequentPatterns.add(new Pair<List<String>,Long>(itemset, support));
    }
  }

  /**
   * Writes all patterns to {@code out}: the pattern count (int), then for
   * each pattern the item count (int), the support (long) and each item via
   * {@code writeUTF}.
   *
   * @param out the output to write to
   * @throws IOException if writing to {@code out} fails
   */
  @Override
  public void write(DataOutput out) throws IOException {
    out.writeInt(this.frequentPatterns.size());
    for (Pair<List<String>,Long> entry : this.frequentPatterns) {
      List<String> itemset = entry.getFirst();
      out.writeInt(itemset.size());
      out.writeLong(entry.getSecond());
      for (String element : itemset) {
        out.writeUTF(element);
      }
    }
  }

  /**
   * @return the string forms of all patterns, joined with {@code ", "}
   */
  @Override
  public String toString() {
    StringBuilder text = new StringBuilder();
    Iterator<Pair<List<String>,Long>> cursor = this.frequentPatterns.iterator();
    while (cursor.hasNext()) {
      text.append(cursor.next().toString());
      if (cursor.hasNext()) {
        text.append(", ");
      }
    }
    return text.toString();
  }
}