package ivory.lsh.driver;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import ivory.core.RetrievalEnvironment;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.mapred.JobConf;
/**
* Runner class for pairwise similarity algorithms.
*
* @author ferhanture
*
*/
public abstract class PwsimEnvironment extends Configured {
public static final boolean cluster = true; //set this parameter true if working on cluster
public static boolean isCrossLingual;
//Signatures
public static String permutationType; // how to permute signatures: by bits ("bit") or by blocks ("block")
public static String signatureType; // the type of signature: "simhash", "minhash", or "random"
public static int numOfPermutations; // number of permutation tables (i.e., Q : number of permutations to use in randomized pwsim algorithm)
public static int numOfBits; // number of bits on the signature
//Similarity
public static int maxHammingDistance; // maximum allowable distance for similarity
//Eval
public static int sampleSize = -1;
public static int numResults = -1;
public static String mode;
//Sliding Window
public static int slidingWindowSize; // window size of similarity comparisons (i.e., B: beam size parameter for randomized pwsim algorithm)
public static int chunkOverlapSize; // size of overlap between chunks for sliding window algorithm
public static int numChunksPerPermTable = 10; // chunks per permutation table
public static boolean pairwiseWithinChunk = false; // do pairwise comparisons within each chunk
//Batch
public static int batchIndexKeyLength; // length of indexing key in bits for batch and hybrid algorithms
public static int numBatchFiles; // number of batch files (i.e., MapReduce jobs) for batch algorithm
//Partitioned Duplicate Detection
public static boolean withBoundaries = false; // boundaries will be considered or not
public static int dfCut = 100000;
public static float scoreThreshold = 0.5f;
public static void setClassTypes(String signatureType, Configuration config){
if(signatureType.toLowerCase().equals("random")){
config.set("Ivory.SignatureClass", "ivory.lsh.data.NBitSignature");
config.set("Ivory.PairClass", "ivory.lsh.data.PairOfIntNBitSignature");
config.set("Type", "Random");
}else if(signatureType.toLowerCase().equals("simhash")){
config.set("Ivory.SignatureClass", "ivory.lsh.data.SixtyFourBitSignature");
config.set("Ivory.PairClass", "ivory.lsh.data.PairOfInt64BitSignature");
config.set("Type", "Simhash");
}else if(signatureType.toLowerCase().equals("minhash")){
config.set("Ivory.SignatureClass", "ivory.lsh.data.MinhashSignature");
config.set("Ivory.PairClass", "ivory.lsh.data.PairOfIntMinhashSignature");
config.set("Type", "Minhash");
}else{
throw new RuntimeException("Error: Unknown signature type.");
}
}
public static String getTermDocvectorsFile (String dir, FileSystem fs) throws IOException {
RetrievalEnvironment env = new RetrievalEnvironment(dir, fs);
return env.getWeightedTermDocVectorsDirectory();
}
public static String getIntDocvectorsFile (String dir, FileSystem fs) throws IOException {
RetrievalEnvironment env = new RetrievalEnvironment(dir, fs);
return env.getWeightedIntDocVectorsDirectory();
}
public static String getIntDocvectorsFile (String dir, FileSystem fs, int sampleSize) throws IOException {
RetrievalEnvironment env = new RetrievalEnvironment(dir, fs);
String s = env.getWeightedIntDocVectorsDirectory();
return s.substring(0, s.length()-1)+"_sample="+sampleSize;
}
public static String getTermDocvectorsFile (String dir, FileSystem fs, int sampleSize) throws IOException {
RetrievalEnvironment env = new RetrievalEnvironment(dir, fs);
String s = env.getWeightedTermDocVectorsDirectory();
return s.substring(0, s.length()-1)+"_sample="+sampleSize;
}
public static String getPermutationsFile (String dir, FileSystem fs, int numOfBits, int numOfPermutations) throws IOException {
return dir + "/random-perms-bit_D=" + numOfBits + "_Q=" + numOfPermutations;
}
public static String getTablesDir (String dir, FileSystem fs, String signatureType, int numOfBits, int chunkOverlapSize, int numOfPermutations) throws IOException {
return dir + "/tables_" + signatureType + "_D=" + numOfBits + "_V=" + chunkOverlapSize + "_Q=" + numOfPermutations;
}
public static String getPwsimDir (String dir, String signatureType, int maxHammingDistance, int numOfBits, int numOfPermutations, int slidingWindowSize) throws IOException {
return dir + "/similardocs_" + signatureType + "_maxdst=" + maxHammingDistance + "_D=" + numOfBits + "_Q=" + numOfPermutations + "_B=" + slidingWindowSize;
}
public static String getFilteredPwsimDir (String dir, String signatureType, int maxHammingDistance, int numOfBits, int numOfPermutations, int slidingWindowSize, String docnos, int numResults) throws IOException {
return dir + "/similardocs_" + signatureType + "_maxdst=" + maxHammingDistance + "_D=" + numOfBits + "_Q=" + numOfPermutations + "_B=" + slidingWindowSize + "-filtered_sample=" + docnos.substring(docnos.lastIndexOf("/") + 1) + "_top" + numResults;
}
public static String getRandomVectorsDir (String dir, int numOfBits) {
return dir + "/randomvectors_D="+numOfBits;
}
public static String getSignaturesDir (String dir, int numOfBits, String type) {
return dir + "/signatures-" + type + "_D=" + numOfBits;
}
public static String getSignaturesDir (String dir, int numOfBits, String type, int numBatch) {
return dir + "/signatures-" + type + "_D=" + numOfBits + "_batch=" + numBatch;
}
public static JobConf setBitextPaths(JobConf conf, String dataDir, String eLang, String fLang, String bitextName, String eDir, String fDir) throws IOException, URISyntaxException {
String eSentDetect = dataDir+"/sent/"+eLang+"-sent.bin";
String eTokenizer = dataDir+"/token/"+eLang+"-token.bin";
String eVocabSrc = dataDir+"/"+bitextName+"/vocab."+eLang+"-"+fLang+"."+eLang;
String eStopwords = dataDir+"/token/"+eLang+".stop";
String eVocabTrg = dataDir+"/"+bitextName+"/vocab."+fLang+"-"+eLang+"."+eLang;
String eStemmedStopwords = dataDir+"/token/"+eLang+".stop.stemmed";
String fSentDetect = dataDir+"/sent/"+fLang+"-sent.bin";
String fTokenizer = dataDir+"/token/"+fLang+"-token.bin";
String fVocabSrc = dataDir+"/"+bitextName+"/vocab."+fLang+"-"+eLang+"."+fLang;
String fStopwords = dataDir+"/token/"+fLang+".stop";
String fVocabTrg = dataDir+"/"+bitextName+"/vocab."+eLang+"-"+fLang+"."+fLang;
String fStemmedStopwords = dataDir+"/token/"+fLang+".stop.stemmed";
String f2e_ttableFile = dataDir+"/"+bitextName+"/ttable."+fLang+"-"+eLang;
String e2f_ttableFile = dataDir+"/"+bitextName+"/ttable."+eLang+"-"+fLang;
conf.set("eDir", eDir);
conf.set("fDir", fDir);
conf.set("eLang", eLang);
conf.set("fLang", fLang);
conf.set("fTokenizer", fTokenizer);
conf.set("eTokenizer", eTokenizer);
conf.set("eStopword", eStopwords);
conf.set("fStopword", fStopwords);
conf.set("eStemmedStopword", eStemmedStopwords);
conf.set("fStemmedStopword", fStemmedStopwords);
//e-files
RetrievalEnvironment eEnv = new RetrievalEnvironment(eDir, FileSystem.get(conf));
DistributedCache.addCacheFile(new URI(eEnv.getDfByTermData()), conf);
DistributedCache.addCacheFile(new URI(eSentDetect), conf);
DistributedCache.addCacheFile(new URI(eTokenizer), conf);
DistributedCache.addCacheFile(new URI(eVocabSrc), conf);
DistributedCache.addCacheFile(new URI(eVocabTrg), conf);
//f-files
// DistributedCache.addCacheFile(new URI(fDir+"/transDf.dat"), conf);
DistributedCache.addCacheFile(new URI(fSentDetect), conf);
DistributedCache.addCacheFile(new URI(fTokenizer), conf);
DistributedCache.addCacheFile(new URI(fVocabSrc), conf);
DistributedCache.addCacheFile(new URI(fVocabTrg), conf);
/////cross-lang files
DistributedCache.addCacheFile(new URI(f2e_ttableFile), conf);
DistributedCache.addCacheFile(new URI(e2f_ttableFile), conf);
DistributedCache.addCacheFile(new URI(eEnv.getIndexTermsData()), conf);
return conf;
}
public static JobConf setBitextPaths(JobConf conf, String dataDir, String eLang, String fLang, String bitextName, String eDir, String fDir, float classifierThreshold, int classifierId, String pwsimPairsPath, String classifierType) throws IOException, URISyntaxException {
conf = setBitextPaths(conf, dataDir, eLang, fLang, bitextName, eDir, fDir);
String classifierFile = dataDir+"/"+bitextName+"/classifier-" + classifierType + "." + fLang + "-" + eLang;
DistributedCache.addCacheFile(new URI(classifierFile), conf);
conf.setFloat("ClassifierThreshold", classifierThreshold);
conf.setInt("ClassifierId", classifierId);
if (pwsimPairsPath != null) {
conf.set("PwsimPairs", pwsimPairsPath);
DistributedCache.addCacheFile(new URI(pwsimPairsPath), conf);
}
return conf;
}
// public static String getFileNameWithPars(String dir, String fileName) throws Exception{
// return getFileNameWithPars(dir, fileName, FileSystem.get(new Configuration()));
// }
// public static String getFileNameWithPars(String dir, String fileName, FileSystem fs) throws Exception{
// RetrievalEnvironment env = new RetrievalEnvironment(dir, fs);
// }else if(fileName.equals("SampleDocnos")){
// return dir + "/sample-docnos_"+sampleSize;
// }else if(fileName.equals("SignaturesSimhash")){
// if(numBatchFiles>0){
// return dir + "/signatures-simhash_D="+numOfBits+"_batch="+numBatchFiles;
// }else{
// return dir + "/signatures-simhash_D="+numOfBits;
// }
// }else if(fileName.equals("SignaturesMinhash")){
// if(numBatchFiles>0){
// return dir + "/signatures-minhash_D="+numOfBits+"_batch="+numBatchFiles;
// }else{
// return dir + "/signatures-minhash_D="+numOfBits;
// }
// }else if(fileName.equals("SignaturesRandom")){
// if(numBatchFiles>0){
// return dir + "/signatures-random_D="+numOfBits+"_batch="+numBatchFiles;
// }else{
// return dir + "/signatures-random_D="+numOfBits;
// }
// }else if(fileName.equals("Signatures")){
// if(numBatchFiles>0){
// return dir + "/signatures-"+ signatureType +"_D="+numOfBits+"_batch="+numBatchFiles;
// }else{
// return dir + "/signatures-"+ signatureType +"_D="+numOfBits;
// }
// }else if(fileName.equals("SignaturesIndexable")){
// return dir + "/signatures-random-indx_D="+numOfBits;
// }else if(fileName.equals("P-SignaturesSimhash")){
// if(numBatchFiles>0){
// return dir + (withBoundaries? "/p-b":"/p")+"-signatures-simhash_D="+numOfBits+"_batch="+numBatchFiles;
// }else{
// return dir + (withBoundaries? "/p-b":"/p")+"-signatures-simhash_D="+numOfBits;
// }
// }else if(fileName.equals("P-SignaturesMinhash")){
// if(numBatchFiles>0){
// return dir + (withBoundaries? "/p-b":"/p")+"-signatures-minhash_D="+numOfBits+"_batch="+numBatchFiles;
// }else{
// return dir + (withBoundaries? "/p-b":"/p")+"-signatures-minhash_D="+numOfBits;
// }
// }else if(fileName.equals("P-SignaturesRandom")){
// if(numBatchFiles>0){
// return dir + (withBoundaries? "/p-b":"/p")+"-signatures-random_D="+numOfBits+"_batch="+numBatchFiles;
// }else{
// return dir + (withBoundaries? "/p-b":"/p")+"-signatures-random_D="+numOfBits;
// }
// }else if(fileName.equals("BatchSignaturesMap")){
// if(numBatchFiles>0){
// return dir + "/map-signatures-"+signatureType+"_L="+batchIndexKeyLength+"_Q="+numOfPermutations+"_D="+numOfBits+"_batch="+numBatchFiles;
// }else{
// return dir + "/map-signatures-"+signatureType+"_L="+batchIndexKeyLength+"_Q="+numOfPermutations+"_D="+numOfBits;
// }
// /*}else if(fileName.equals("P-Bnd-BatchSignaturesMap")){
// return dir + "/p-bnd-map-signatures-"+signatureType+"_L="+batchIndexKeyLength+"_Q="+numOfPermutations+"_D="+numOfBits;*/
// }else if(fileName.equals("PWSim")){
// return dir + "/similardocs_D="+numOfBits+"_Q="+numOfPermutations+"_B="+slidingWindowSize; //for backward compatibility. may remove later.
//
// }else if(fileName.equals("PWSimCollectionFiltered")){
// if(batchIndexKeyLength>0)
// return dir + "/similardocs_hybrid_"+signatureType+"_maxdst="+maxHammingDistance+"_D="+numOfBits+"_Q="+numOfPermutations+"_B="+slidingWindowSize+"_L="+batchIndexKeyLength+(pairwiseWithinChunk?"_pw":"")+"-filtered_top"+numResults;
// else
// return dir + "/similardocs_coll_"+signatureType+"_maxdst="+maxHammingDistance+"_D="+numOfBits+"_Q="+numOfPermutations+"_B="+slidingWindowSize+"-filtered_top"+numResults;
//
// }else if(fileName.equals("P-PWSimCollection")){
// if(batchIndexKeyLength>0)
// return dir + (withBoundaries? "/p-b":"/p")+ "-similardocs_hybrid_"+signatureType+"_maxdst="+maxHammingDistance+"_D="+numOfBits+"_Q="+numOfPermutations+"_B="+slidingWindowSize+"_L="+batchIndexKeyLength+(pairwiseWithinChunk?"_pw":"");
// else
// return dir + (withBoundaries? "/p-b":"/p")+"-similardocs_coll_"+signatureType+"_maxdst="+maxHammingDistance+"_D="+numOfBits+"_Q="+numOfPermutations+"_B="+slidingWindowSize;
//
// }else if(fileName.equals("PCP")){
// return dir + "/pcp-dfcut=" + dfCut+"_Th="+scoreThreshold;
//
// }else if(fileName.equals("EvaluateGolden")){
// if(chunkOverlapSize > 0)
// return dir + "/eval_similardocs_coll_"+signatureType+"_maxdst="+maxHammingDistance+"_D="+numOfBits+"_Q="+numOfPermutations+((batchIndexKeyLength>0)?("_L="+batchIndexKeyLength):("_B="+slidingWindowSize));
// else
// return dir + "/eval_similardocs_"+((numBatchFiles>0)?"batch":"hybrid")+"_"+signatureType+"_maxdst="+maxHammingDistance+"_D="+numOfBits+"_Q="+numOfPermutations+((batchIndexKeyLength>0)?("_B="+slidingWindowSize+"_L="+batchIndexKeyLength):"")+((numBatchFiles==0 && pairwiseWithinChunk)?"_pw":"");
//
// }else if(fileName.equals("PCP-EvaluateGolden")){
// if(chunkOverlapSize > 0)
// return dir + "/eval-pcp-dfcut=" + dfCut+"_Th="+scoreThreshold;
// else
// return dir + "/eval_similardocs_"+((numBatchFiles>0)?"batch":"hybrid")+"_"+signatureType+"_maxdst="+maxHammingDistance+"_D="+numOfBits+"_Q="+numOfPermutations+((batchIndexKeyLength>0)?("_B="+slidingWindowSize+"_L="+batchIndexKeyLength):"")+((numBatchFiles==0 && pairwiseWithinChunk)?"_pw":"");
//
// }else if(fileName.equals("U-EvaluateGolden")){
// if(chunkOverlapSize > 0)
// return dir + (withBoundaries? "/u-p-b":"/u-p")+"-eval_similardocs_coll_"+signatureType+"_maxdst="+maxHammingDistance+"_D="+numOfBits+"_Q="+numOfPermutations+((batchIndexKeyLength>0)?("_L="+batchIndexKeyLength):("_B="+slidingWindowSize));
// else
// return dir + (withBoundaries? "/u-p-b":"/u-p")+"-eval_similardocs_"+((numBatchFiles>0)?"batch":"hybrid")+"_"+signatureType+"_maxdst="+maxHammingDistance+"_D="+numOfBits+"_Q="+numOfPermutations+((batchIndexKeyLength>0)?("_B="+slidingWindowSize+"_L="+batchIndexKeyLength):"")+((numBatchFiles==0 && pairwiseWithinChunk)?"_pw":"");
//
// }else if(fileName.equals("P-EvaluateGolden")){
// if(chunkOverlapSize > 0)
// return dir + (withBoundaries? "/p-b":"/p")+"-eval_similardocs_coll_"+signatureType+"_maxdst="+maxHammingDistance+"_D="+numOfBits+"_Q="+numOfPermutations+((batchIndexKeyLength>0)?("_L="+batchIndexKeyLength):("_B="+slidingWindowSize));
// else
// return dir + (withBoundaries? "/p-b":"/p")+"-eval_similardocs_"+((numBatchFiles>0)?"batch":"hybrid")+"_"+signatureType+"_maxdst="+maxHammingDistance+"_D="+numOfBits+"_Q="+numOfPermutations+((batchIndexKeyLength>0)?("_B="+slidingWindowSize+"_L="+batchIndexKeyLength):"")+((numBatchFiles==0 && pairwiseWithinChunk)?"_pw":"");
//
// }else if(fileName.equals("PWSimBatch")){
// return dir + "/similardocs_batch_"+signatureType+"_maxdst="+maxHammingDistance+"_D="+numOfBits+"_Q="+numOfPermutations+"_L="+batchIndexKeyLength;
//
// }else if(fileName.equals("P-Bnd-PWSimBatch")){
// return dir + "/p-bnd-similardocs_batch_"+signatureType+"_maxdst="+maxHammingDistance+"_D="+numOfBits+"_Q="+numOfPermutations+"_L="+batchIndexKeyLength;
//
// }else if(fileName.equals("Tables")){
// if(batchIndexKeyLength>0){
// return dir + "/tablesIndexed_"+signatureType+"_D="+numOfBits+"_keylen="+batchIndexKeyLength+"_Q="+numOfPermutations;
// }else{
// return dir + "/tables_"+signatureType+"_D="+numOfBits+"_V="+chunkOverlapSize+"_Q="+numOfPermutations;
// }
//
// }else if(fileName.equals("P-Tables")){
// if(batchIndexKeyLength>0){
// return dir + (withBoundaries? "/p-b":"/p")+ "-tablesIndexed_"+signatureType+"_D="+numOfBits+"_keylen="+batchIndexKeyLength+"_Q="+numOfPermutations;
// }else{
// return dir + (withBoundaries? "/p-b":"/p")+ "-tables_"+signatureType+"_D="+numOfBits+"_V="+chunkOverlapSize+"_Q="+numOfPermutations;
// }
//
// }else if(fileName.equals("Permsbit")){
// return dir + "/random-perms-bit"+"_D="+numOfBits+"_Q="+numOfPermutations;
//
// }else if(fileName.equals("Permsblk")){
// return dir + "/random-perms-blk"+"_D="+numOfBits+"_Q="+numOfPermutations;
//
// }else if(fileName.equals("EvalSeeAlso")){
// return dir + "/eval-seealso_coll_perm=bit_maxdst="+maxHammingDistance+"_D="+numOfBits+"_Q="+numOfPermutations+"_B="+slidingWindowSize;
//
// }else if(fileName.equals("DuplicateSets")){
// return dir + "/sets_similardocs_coll_perm=bit_maxdst="+maxHammingDistance+"_D="+numOfBits+"_Q="+numOfPermutations+"_B="+slidingWindowSize;
// }
//
// else{
// throw new RuntimeException("Could not process file name: "+fileName);
// }
// }
}