package mia.clustering.ch12; import mia.clustering.ch12.twitter.ByKeyGroupingJob; import org.apache.commons.codec.language.DoubleMetaphone; import org.apache.mahout.common.Parameters; public class CreateTwitterUserDataset { public static void main(String args[]) throws Exception { Parameters params = new Parameters(); params.set("splitPattern", "\t"); String inputDir = "tweets.txt"; String outputDir = "twitter_seqfiles"; params.set("input", inputDir); params.set("output", outputDir); params.set("selectedField", "1"); // tweet params.set("groupByField", "0"); // username ByKeyGroupingJob.startJob(params); DoubleMetaphone filter = new DoubleMetaphone(); // TODO: change these terms?! System.out.println(filter.encode("Loke")); System.out.println(filter.encode("companymancomic")); System.out.println(filter.encode("webcomics")); System.out.println(filter.encode("@comic")); } }