package ca.pfv.spmf.algorithms.clustering.text_clusterer;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Set;

import ca.pfv.spmf.tools.MemoryLogger;

/* This file is copyright (c) 2008-2012 Philippe Fournier-Viger
 *
 * This file is part of the SPMF DATA MINING SOFTWARE
 * (http://www.philippe-fournier-viger.com/spmf).
 *
 * SPMF is free software: you can redistribute it and/or modify it under the
 * terms of the GNU General Public License as published by the Free Software
 * Foundation, either version 3 of the License, or (at your option) any later
 * version.
 *
 * SPMF is distributed in the hope that it will be useful, but WITHOUT ANY
 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
 * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 * You should have received a copy of the GNU General Public License along with
 * SPMF. If not, see <http://www.gnu.org/licenses/>.
 */
/**
 * TextClusterAlgo is an implementation of a simple text clustering algorithm.
 * <p>
 * The input is a TSV file where each line has the format: RecordId \t Record.
 * Eg: 1	The document about a cat
 * The output is a TSV file where each line has the format: RecordId \t ClusterNumber.
 *
 * @author Sabarish Raghu
 */
public class TextClusterAlgo {

	/** the set of all distinct words appearing in the input records */
	static HashSet<String> allWords = new HashSet<String>();

	/** map between each record's index (position in the input) and its recordId */
	static HashMap<Integer, Integer> idMap = new HashMap<Integer, Integer>();

	long startTimestamp = 0; // last execution start time
	long endTimeStamp = 0;   // last execution end time
	boolean stemFlag;        // whether stemming should be performed
	boolean stopWordFlag;    // whether stop words should be removed

	/**
	 * Run the algorithm.
	 * @param inputPath input file path
	 * @param outputPath output file path
	 * @param stemFlag if true, stemming is performed
	 * @param stopWordFlag if true, stop words are removed
	 */
	public void runAlgorithm(String inputPath, String outputPath, boolean stemFlag, boolean stopWordFlag) {
		this.stemFlag = stemFlag;
		this.stopWordFlag = stopWordFlag;
		runAlgorithm(inputPath, outputPath);
	}

	/**
	 * Run the algorithm.
	 * @param inputPath input file path
	 * @param outputPath output file path
	 */
	public void runAlgorithm(String inputPath, String outputPath) {
		// check the input path before trying to open the file
		if (inputPath == null) {
			System.out.println("Please pass the path of the input");
			return;
		}
		startTimestamp = System.currentTimeMillis();
		MemoryLogger.getInstance().reset();
		// clear state possibly left over from a previous run
		allWords.clear();
		idMap.clear();
		try (BufferedReader inputReader = new BufferedReader(new FileReader(new File(inputPath)));
				BufferedWriter outputWriter = new BufferedWriter(new FileWriter(new File(outputPath)))) {

			ArrayList<Record> records = loadInput(inputReader, stemFlag, stopWordFlag);

			// compute the tf-idf vector of each record over the global vocabulary
			for (Record record : records) {
				double[] tfIdfVector = new double[allWords.size()];
				int vectorIncrementer = 0;
				for (String word : allWords) {
					tfIdfVector[vectorIncrementer] = FindTFIDF(record.getAttribute(), word, records);
					vectorIncrementer++;
				}
				record.setTfVector(tfIdfVector);
			}

			// compute the pairwise similarity matrix between all records
			double[][] sim = new double[records.size()][records.size()];
			for (int i = 0; i < records.size(); i++) {
				for (int j = 0; j < records.size(); j++) {
					sim[i][j] = calculateSimilarity(records.get(i).getTfVector(), records.get(j).getTfVector());
				}
			}

			// for each record, find the most similar other record
			ArrayList<SimilarRecords> similarRecordPairs = new ArrayList<SimilarRecords>();
			for (int i = 0; i < records.size(); i++) {
				double max = 0.0;
				int ipos = 0;
				int jpos = 0;
				for (int j = 0; j < records.size(); j++) {
					if (i != j && sim[i][j] > max) {
						max = sim[i][j];
						ipos = i;
						jpos = j;
					}
				}
				SimilarRecords pair = new SimilarRecords();
				pair.setRecord1Pos(ipos);
				pair.setRecord2Pos(jpos);
				pair.setSimilarity(max);
				similarRecordPairs.add(pair);
			}
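			/*
			 * Illustration (hypothetical values, not part of the original
			 * code): with three records whose pairwise similarities are
			 * sim[0][1] = 0.8, sim[0][2] = 0.1 and sim[1][2] = 0.3, the loop
			 * above produces the best-match pairs (0,1), (1,0) and (2,1);
			 * the loop below then turns each pair into a two-element cluster.
			 */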
			// merge each best-match pair into a two-element cluster
			Set<TextCluster> clusters = new HashSet<TextCluster>();
			for (SimilarRecords similarPair : similarRecordPairs) {
				int i = similarPair.getRecord1Pos();
				int j = similarPair.getRecord2Pos();
				ArrayList<Integer> tempList = new ArrayList<Integer>();
				TextCluster result = new TextCluster();
				tempList.add(i);
				tempList.add(j);
				result.setCluster(tempList);
				clusters.add(result);
			}

			// write each cluster to the output file, one record per line
			int clusterNum = 0;
			outputWriter.write("RecordId\tClusternum\n");
			for (TextCluster output : clusters) {
				ArrayList<Integer> list = output.getCluster();
				for (int i = 0; i < list.size(); i++) {
					outputWriter.write(idMap.get(list.get(i)) + "\t" + clusterNum + "\n");
				}
				clusterNum++;
			}

			MemoryLogger.getInstance().checkMemory();
			endTimeStamp = System.currentTimeMillis();
		} catch (Exception e) {
			System.out.println("Either the file does not exist or an error occurred while clustering");
			e.printStackTrace();
		}
	}

	/**
	 * Print statistics of the latest execution to System.out.
	 */
	public void printStatistics() {
		System.out.println("========== TEXT CLUSTERER - STATS ============");
		System.out.println(" Total time ~: " + (endTimeStamp - startTimestamp) + " ms");
		System.out.println(" Max memory: " + MemoryLogger.getInstance().getMaxMemory() + " mb");
		System.out.println("==============================================");
	}

	/**
	 * Calculate the similarity between two documents as the dot product of
	 * their tf-idf vectors (unnormalized, i.e. not cosine similarity).
	 * @param tfIdfVector1 tf-idf vector of record 1
	 * @param tfIdfVector2 tf-idf vector of record 2
	 * @return similarity value between the records' vectors
	 */
	private static double calculateSimilarity(double[] tfIdfVector1, double[] tfIdfVector2) {
		double similarity = 0;
		for (int i = 0; i < tfIdfVector1.length; i++) {
			similarity += tfIdfVector1[i] * tfIdfVector2[i];
		}
		return similarity;
	}

	/**
	 * Load the input as a list of Record objects.
	 * @param inputReader the reader used to read the input
	 * @param stemFlag if true, perform stemming; else, do not stem.
	 * @param stopWordFlag if true, remove stop words; else, do not remove them.
	 * @return the list of records
	 */
	private static ArrayList<Record> loadInput(BufferedReader inputReader, boolean stemFlag, boolean stopWordFlag) {
		ArrayList<Record> records = new ArrayList<Record>();
		String currentLine;
		String[] line;
		int recordId;
		String[] words;
		try {
			int i = 0;
			while ((currentLine = inputReader.readLine()) != null) {
				line = currentLine.split("\t", -1);
				Record record = new Record();
				recordId = Integer.parseInt(line[0]);
				record.setRecordId(recordId);
				// lower-case the text and replace non-alphanumeric characters by spaces
				String attribute = line[1].toLowerCase();
				attribute = attribute.replaceAll("[^a-zA-Z0-9]+", " ");
				if (stopWordFlag) {
					StopWordAnalyzer analyzer = new StopWordAnalyzer();
					attribute = analyzer.removeStopWords(attribute);
				}
				idMap.put(i, recordId);
				if (stemFlag) {
					PorterStemmer stemmer = new PorterStemmer();
					attribute = stemmer.stem(attribute);
				}
				// add each word of the record to the global vocabulary
				words = attribute.split(" ");
				for (String word : words) {
					allWords.add(word);
				}
				record.setAttribute(attribute);
				records.add(record);
				i++;
			}
			return records;
		} catch (Exception e) {
			e.printStackTrace();
		}
		return records;
	}
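	/*
	 * Worked example for the three tf-idf helpers below (hypothetical
	 * numbers): for the document "the cat sat", the term "cat", and a
	 * collection of 10 records of which 4 contain "cat",
	 * FindTermFrequency returns 1/3, FindInverseDocumentFrequency returns
	 * (float) Math.log(11.0 / 5.0), about 0.788 (natural log, with the
	 * smoothed counts 1 + 10 and 1 + 4), so FindTFIDF returns about 0.263.
	 */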
	/**
	 * Find the TF-IDF value of a given term for a given document in the whole
	 * set of documents.
	 * @param document the text record in the input file
	 * @param term the term from allWords
	 * @param records the whole record collection of the input file
	 * @return tf-idf value of the given document and term
	 */
	private static double FindTFIDF(String document, String term, ArrayList<Record> records) {
		double tf = FindTermFrequency(document, term);
		float idf = FindInverseDocumentFrequency(term, records);
		return tf * idf;
	}

	/**
	 * Find the inverse document frequency of a term in the whole document
	 * collection, i.e. the log of the ratio of the total number of documents
	 * in the collection to the number of documents containing the term.
	 * Both counts are smoothed by adding 1 to avoid a division by zero.
	 * @param term the term from allWords
	 * @param records the whole record collection of the input file
	 * @return the inverse document frequency
	 */
	private static float FindInverseDocumentFrequency(String term, ArrayList<Record> records) {
		int occurrence = 0;
		for (Record record : records) {
			// note: this is a substring match, so "cat" also counts documents containing "catalog"
			if (record.getAttribute().contains(term)) {
				occurrence++;
			}
		}
		// smoothed idf: log((1 + N) / (1 + df))
		return (float) Math.log((1 + (float) records.size()) / (1 + (float) occurrence));
	}

	/**
	 * Find the ratio of the number of occurrences of a term t in a document d
	 * to the total number of terms in the document.
	 * @param document the text record in the input file
	 * @param term the term from allWords
	 * @return the term frequency of the term in the document
	 */
	private static double FindTermFrequency(String document, String term) {
		int occurrence = 0;
		String[] words = document.split(" ");
		for (String word : words) {
			if (word.equalsIgnoreCase(term)) {
				occurrence++;
			}
		}
		return (double) ((float) occurrence / (float) (words.length));
	}
}
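/*
 * A minimal usage sketch (not part of the original algorithm; the file names
 * below are hypothetical): runs the clusterer with stemming and stop-word
 * removal enabled, then prints the execution statistics.
 */
class ExampleTextClusterUsage {
	public static void main(String[] args) {
		TextClusterAlgo algo = new TextClusterAlgo();
		// input.tsv: lines of the form "RecordId \t Record text"
		algo.runAlgorithm("input.tsv", "output.tsv", true, true);
		algo.printStatistics();
	}
}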