/*
* Copyright (C) 2015 Adrien Guille <adrien.guille@univ-lyon2.fr>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package main.java.fr.ericlab.sondy.core.text.index;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.ObjectOutputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map.Entry;
import org.apache.commons.io.FileUtils;
/**
*
* @author Adrien GUILLE, Laboratoire ERIC, Université Lumière Lyon 2
*/
public class GlobalIndexer {
int numberOfThreads;
boolean mention;
int messageCount;
public GlobalIndexer(int nbThreads, boolean m){
numberOfThreads = nbThreads;
mention = m;
}
public void index(String directory) throws InterruptedException, FileNotFoundException, IOException {
ArrayList<HashMap<String,Short>> mapList = new ArrayList<>(1500);
String[] fileArray = new File(directory).list();
int fileCount = 0;
for(String filename : fileArray){
if(filename.endsWith(".text"))
fileCount++;
}
int fileCountPerThread = fileCount/numberOfThreads;
LinkedList<Indexer> indexers = new LinkedList<>();
for(int i = 0; i < numberOfThreads; i++){
int upperBound = (i==numberOfThreads-1)?fileCount-1:fileCountPerThread*(i+1);
indexers.add(new Indexer(i,directory,fileCountPerThread*i+1,upperBound,mention,1));
indexers.get(i).start();
}
for(Indexer indexer : indexers){
indexer.join();
}
int messageCountDistribution[] = new int[fileCount];
for(int i = 0; i < numberOfThreads; i++){
Indexer indexer = indexers.get(i);
mapList.addAll(indexer.mapList);
messageCount += indexer.messageCount;
for(Entry entry : indexer.messageCountDistribution.entrySet()){
messageCountDistribution[(int) entry.getKey()] = (int) entry.getValue();
}
}
indexers.clear();
System.gc();
mapList.trimToSize();
HashSet<String> vocabulary = getVocabulary(mapList);
ArrayList<String> vocabularyList = new ArrayList<>();
vocabularyList.addAll(vocabulary);
vocabularyList.trimToSize();
vocabulary.clear();
int numberOfWordsPerThread = vocabularyList.size()/numberOfThreads;
HashSet<String> newVocabulary = new HashSet<>();
LinkedList<Analyzer> analyzers = new LinkedList<>();
for(int i = 0; i < numberOfThreads; i++){
int upperBound = (i==numberOfThreads-1)?vocabularyList.size()-1:numberOfWordsPerThread*(i+1);
analyzers.add(new Analyzer(i,numberOfWordsPerThread*i+1,upperBound,mapList,vocabularyList,fileCount));
analyzers.get(i).start();
}
for(Analyzer analyzer : analyzers){
analyzer.join();
}
for(Analyzer analyzer : analyzers){
newVocabulary.addAll(analyzer.newVocabulary);
}
analyzers.clear();
ArrayList<String> newVocabularyList = new ArrayList<>();
newVocabularyList.addAll(newVocabulary);
newVocabularyList.trimToSize();
int newVocabularySize = newVocabularyList.size();
newVocabulary.clear();
short[][] frequencyMatrix = new short[newVocabularySize][fileCount];
for(int i = 0; i < newVocabularySize; i++){
String word = newVocabularyList.get(i);
for(int j = 0; j < fileCount-1; j++){
Short count = mapList.get(j).get(word);
count = (count==null)?0:count;
frequencyMatrix[i][j] = count;
}
}
directory = (directory.endsWith("/"))?directory:directory+"/";
directory += "indexes/";
File dir = new File(directory);
if(!dir.exists()){
dir.mkdir();
}
String matrixFilename = (mention)?"mentionFrequencyMatrix.dat":"frequencyMatrix.dat";
FileOutputStream fosMatrix = new FileOutputStream(directory+matrixFilename);
ObjectOutputStream oosMatrix = new ObjectOutputStream(fosMatrix);
oosMatrix.writeObject(frequencyMatrix);
String vocabularyFilename = (mention)?"mentionVocabulary.dat":"vocabulary.dat";
FileOutputStream fosVocabulary = new FileOutputStream(directory+vocabularyFilename);
ObjectOutputStream oosVocabulary = new ObjectOutputStream(fosVocabulary);
oosVocabulary.writeObject(newVocabularyList);
String messageCountFilename = (mention)?"messageCountMention.txt":"messageCount.txt";
FileUtils.write(new File(directory+messageCountFilename), messageCount+"");
String messageCountDistributionFilename = (mention)?"messageMentionCountDistribution.dat":"messageCountDistribution.dat";
FileOutputStream fosDistribution = new FileOutputStream(directory+messageCountDistributionFilename);
ObjectOutputStream oosDistribution = new ObjectOutputStream(fosDistribution);
oosDistribution.writeObject(messageCountDistribution);
}
public static HashSet<String> getVocabulary(List<HashMap<String,Short>> mapList){
HashSet<String> vocabulary = new HashSet<>();
for(HashMap<String,Short> map : mapList){
for(String string : map.keySet()){
vocabulary.add(string);
}
}
return vocabulary;
}
}