/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.cloudera.knittingboar.records;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.mahout.math.RandomAccessSparseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.vectorizer.encoders.ConstantValueEncoder;

import com.cloudera.knittingboar.utils.Utils;
import com.google.common.collect.ConcurrentHashMultiset;
import com.google.common.collect.Multiset;

/**
 * RecordFactory for the RCV1 example data described at
 * https://github.com/JohnLangford/vowpal_wabbit/wiki/Rcv1-example
 * 
 * @author jpatterson
 */
public class RCV1RecordFactory implements RecordFactory {
  
  public static final int FEATURES = 10000;
  
  ConstantValueEncoder encoder = null;
  
  public RCV1RecordFactory() {
    this.encoder = new ConstantValueEncoder("body_values");
  }
  
  /**
   * Scans an input file, tallying class labels and namespace tokens and
   * printing the first few parsed feature vectors for inspection.
   * 
   * @param file path of the input file to scan
   * @param debug_break_cnt stop scanning after this many records
   * @throws IOException
   */
  public static void ScanFile(String file, int debug_break_cnt)
      throws IOException {
    
    int line_count = 0;
    
    Multiset<String> class_count = ConcurrentHashMultiset.create();
    Multiset<String> namespaces = ConcurrentHashMultiset.create();
    
    BufferedReader reader = null;
    try {
      reader = new BufferedReader(new FileReader(file));
      
      String line = reader.readLine();
      while (line != null && line.length() > 0) {
        
        String[] parts = line.split(" ");
        
        // parts[0] is the class label, parts[1] the namespace token
        class_count.add(parts[0]);
        namespaces.add(parts[1]);
        
        line = reader.readLine();
        line_count++;
        
        Vector v = new RandomAccessSparseVector(FEATURES);
        
        // the remaining tokens are "index:value" feature pairs; indices are
        // hashed into the fixed-size vector by simple modulo, so the result
        // is always within [0, FEATURES) for non-negative feature ids
        for (int x = 2; x < parts.length; x++) {
          String[] feature = parts[x].split(":");
          int index = Integer.parseInt(feature[0]) % FEATURES;
          double val = Double.parseDouble(feature[1]);
          v.set(index, val);
        }
        
        Utils.PrintVectorSectionNonZero(v, 10);
        System.out.println("###");
        
        if (line_count > debug_break_cnt) {
          break;
        }
      }
      
      System.out.println("Total Rec Count: " + line_count);
      
      System.out.println("--------------------");
      System.out.println("Classes");
      for (String word :
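  /*
   * Expected input layout, one record per line (a sketch inferred from the
   * parsing code in this class; the sample values below are hypothetical):
   *
   *   <label> <namespace> <index>:<value> <index>:<value> ...
   *   e.g.  "1 f 13:0.0396 24:0.0347"
   *
   * Feature indices are folded into the fixed-size vector by taking them
   * modulo FEATURES.
   */
  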
class_count.elementSet()) {
        System.out.println("Class " + word + ": " + class_count.count(word));
      }
      
      System.out.println("--------------------");
      System.out.println("Namespaces:");
      for (String word : namespaces.elementSet()) {
        System.out.println("Namespace " + word + ": " + namespaces.count(word));
      }
      
    } finally {
      if (reader != null) {
        reader.close();
      }
    }
  }
  
  // doesn't do anything meaningful for a two-class dataset; the numeric
  // label is simply echoed back as its own class name
  @Override
  public String GetClassnameByID(int id) {
    return String.valueOf(id);
  }
  
  /**
   * Processes a single line of input into a target variable (the return
   * value) and a feature vector (written into v).
   * 
   * Right now our hash function is simply "modulo" against the fixed
   * feature-vector size.
   * 
   * @throws Exception
   */
  public int processLine(String line, Vector v) throws Exception {
    
    String[] parts = line.split(" ");
    int actual = Integer.parseInt(parts[0]);
    
    // the namespace token (parts[1], e.g. "f") is currently ignored
    for (int x = 2; x < parts.length; x++) {
      String[] feature = parts[x].split(":");
      int index = Integer.parseInt(feature[0]) % FEATURES;
      double val = Double.parseDouble(feature[1]);
      v.set(index, val);
    }
    
    return actual;
  }
  
  @Override
  public List<String> getTargetCategories() {
    
    List<String> out = new ArrayList<String>();
    out.add("0");
    out.add("1");
    return out;
  }
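  
  /**
   * Minimal usage sketch (added for illustration, not part of the original
   * API). The sample record below is hypothetical; it only mirrors the
   * "label namespace index:value ..." layout that processLine parses.
   */
  public static void main(String[] args) throws Exception {
    RCV1RecordFactory factory = new RCV1RecordFactory();
    Vector v = new RandomAccessSparseVector(FEATURES);
    // hypothetical record: label 1, namespace "f", two index:value features
    int label = factory.processLine("1 f 7:0.42 19:0.13", v);
    System.out.println("label = " + label + ", non-default elements = "
        + v.getNumNondefaultElements());
  }
  
}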