package lda.wikievidence.modelcreation;

import java.io.BufferedReader;
import java.io.Closeable;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;

import lda.wikievidence.modelcreation.WikipediaLDAThreadExtractEvidenceTerms.Output;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

import com.google.gson.Gson;

/**
 * Command-line driver that schedules {@link WikipediaLDAThreadExtractEvidenceTerms}
 * tasks for the entries listed in {@link #CIRCLES}.
 *
 * <p>An entry is scheduled only when its first field is present in the Lucene
 * index ({@link #fillHashMap()}) and no file of that name is already present in
 * {@link #EVIDENCEDIR} ({@link #initializeAvailableFiles()}). Tasks are handed
 * to the inherited {@code executeThreadPool} in batches of {@link #POOLSIZE}.
 *
 * <p>The {@code initialize*} methods and {@link #fillHashMap()} populate the
 * lookup structures and are expected to run before {@link #start()}; see
 * {@link #main(String[])} for the order used.
 */
public class MineEvidences extends LDAExecutor {

	/** Pipe-separated input file read line by line in {@link #start()}. */
	public static final String CIRCLES = "/home/zwicklbauer/circles.dat";
	/** Directory whose file names mark entries that are skipped by {@link #start()}. */
	public static final String EVIDENCEDIR = "/home/hduser/EvidenceMining/evidence";
	/** File with one JSON-serialized {@link Output} per line, read by {@link #initializeData()}. */
	public static final String HADOOPINPUT = "/home/zwicklbauer/ldadata.dat";
	/** Location of the Lucene index opened by {@link #fillHashMap()}. */
	public static final String LUCENEINDEX = "/home/zwicklbauer/MMapLuceneIndexStandard/";
	/** Not referenced inside this class. */
	public static final int RANDOMDOCUMENTS = 1000;
	/** Every {@code MODDOCUMENTS}-th line of {@link #HADOOPINPUT} is loaded. */
	public static final int MODDOCUMENTS = 6;
	/** Number of tasks collected before a batch is executed. */
	public static final int POOLSIZE = 26;

	/** Upper bound on the number of lines of {@link #HADOOPINPUT} that are scanned. */
	private static final int MAXINPUTLINES = 1000000;
	/** File-name suffix removed (literally) from entry and file names. */
	private static final String HTMLSUFFIX = ".html";
	/** Prefix removed from the {@code Mainlink} field of every index document. */
	private static final String DBPEDIAPREFIX = "http://dbpedia.org/resource/";
	private static final String ENCODING = "UTF-8";

	/** Normalized {@code Mainlink} values found in the Lucene index. */
	private HashSet<String> indexStrings;
	/** Sampled {@link Output} records keyed by a running counter starting at 0. */
	private Map<Integer, Output> hashmap;
	/** Normalized page name mapped to the original file name in the pages directory. */
	private Map<String, String> fileHashMap;
	/** Names of the files currently present in {@link #EVIDENCEDIR}. */
	private Set<String> fileSet;

	/** Creates an instance with empty lookup structures. */
	public MineEvidences() {
		super();
		this.indexStrings = new HashSet<String>();
		this.hashmap = new HashMap<Integer, Output>();
		this.fileSet = new HashSet<String>();
		this.fileHashMap = new HashMap<String, String>();
	}

	/**
	 * Records the names of all files in {@link #EVIDENCEDIR} so that
	 * {@link #start()} can skip them.
	 *
	 * <p>If the directory cannot be listed, a message is printed and the set of
	 * known files is left unchanged instead of failing with a
	 * {@link NullPointerException}.
	 */
	public void initializeAvailableFiles() {
		String[] files = new File(EVIDENCEDIR).list();
		if (files == null) {
			// File.list() returns null when the path is not a readable directory.
			System.err.println("Cannot list directory: " + EVIDENCEDIR);
			return;
		}
		for (String file : files) {
			fileSet.add(file);
		}
	}

	/**
	 * Reads the {@code Mainlink} field of every document in the Lucene index at
	 * {@link #LUCENEINDEX}, strips the DBpedia resource prefix, normalizes the
	 * remainder through a URL decode/encode round trip and stores it in the set
	 * consulted by {@link #start()}.
	 *
	 * <p>Documents without a {@code Mainlink} field are skipped. I/O errors are
	 * printed and end the scan; the reader and directory are always closed.
	 */
	public void fillHashMap() {
		Directory dir = null;
		IndexReader reader = null;
		try {
			dir = FSDirectory.open(new File(LUCENEINDEX));
			reader = DirectoryReader.open(dir);
			int docs = reader.maxDoc();
			for (int i = 0; i < docs; i++) {
				Document doc = reader.document(i);
				String link = doc.get("Mainlink");
				if (link == null) {
					continue;
				}
				// Literal replacement: the prefix contains '.' which replaceAll
				// would interpret as a regex wildcard.
				link = link.replace(DBPEDIAPREFIX, "");
				indexStrings.add(normalizeName(link));
			}
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			closeResource(reader);
			closeResource(dir);
		}
	}

	/**
	 * Reads {@link #CIRCLES} line by line, splits each line on {@code '|'},
	 * removes the {@code ".html"} suffix from every field and schedules a
	 * {@link WikipediaLDAThreadExtractEvidenceTerms} task for each line whose
	 * first field is in the index set and not in the set of existing evidence
	 * files.
	 *
	 * <p>Tasks are executed in batches of {@link #POOLSIZE}; the final, possibly
	 * smaller batch is executed after the last line has been read. I/O errors
	 * are printed and end processing; the input file is always closed.
	 */
	public void start() {
		BufferedReader reader = null;
		try {
			reader = new BufferedReader(new FileReader(new File(CIRCLES)));
			List<LDAClient> lst = new LinkedList<LDAClient>();
			String line = null;
			while ((line = reader.readLine()) != null) {
				String[] split = line.split("\\|");
				for (int i = 0; i < split.length; i++) {
					split[i] = split[i].replace(HTMLSUFFIX, "");
				}
				if (lst.size() == POOLSIZE) {
					executeThreadPool(lst);
					lst.clear();
				}
				if (indexStrings.contains(split[0]) && !fileSet.contains(split[0])) {
					// The first argument is the task's position within the current batch.
					// NOTE(review): meaning of the constant 50 is defined by the task class - confirm there.
					lst.add(new WikipediaLDAThreadExtractEvidenceTerms(lst.size(), 50, hashmap, split, fileHashMap));
				}
			}
			// Run whatever is left over; without this the trailing tasks
			// (fewer than POOLSIZE) would never be executed.
			if (!lst.isEmpty()) {
				executeThreadPool(lst);
				lst.clear();
			}
		} catch (FileNotFoundException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			closeResource(reader);
		}
	}

	/**
	 * Loads every {@link #MODDOCUMENTS}-th line of {@link #HADOOPINPUT} (starting
	 * with the first line, scanning at most the first 1,000,000 lines), parses it
	 * as a JSON {@link Output} and stores it under consecutive integer keys
	 * starting at 0.
	 *
	 * <p>I/O errors are printed and end loading; the input file is closed if it
	 * was opened.
	 */
	public void initializeData() {
		Gson gson = new Gson();
		BufferedReader reader = null;
		try {
			reader = new BufferedReader(new FileReader(new File(HADOOPINPUT)));
			String line = null;
			int lineNr = 0;
			int entries = 0;
			while ((line = reader.readLine()) != null && lineNr < MAXINPUTLINES) {
				if ((lineNr % MODDOCUMENTS) == 0) {
					Output out = gson.fromJson(line, Output.class);
					hashmap.put(entries, out);
					entries++;
				}
				lineNr++;
			}
		} catch (FileNotFoundException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			// reader is still null when the file could not be opened.
			closeResource(reader);
		}
	}

	/**
	 * Builds the mapping from normalized page name to original file name for all
	 * files in {@link WikipediaLDAThreadExtractEvidenceTerms#WIKIPEDIAPAGESDIR}.
	 *
	 * <p>The key is derived from the file name by removing {@code ".html"},
	 * replacing every apostrophe with {@code '%'} and applying the same URL
	 * decode/encode round trip used for the index entries. If the directory
	 * cannot be listed, a message is printed and the mapping is left unchanged.
	 */
	public void initializeEntityFiles() {
		String[] names = new File(WikipediaLDAThreadExtractEvidenceTerms.WIKIPEDIAPAGESDIR).list();
		if (names == null) {
			// File.list() returns null when the path is not a readable directory.
			System.err.println("Cannot list directory: " + WikipediaLDAThreadExtractEvidenceTerms.WIKIPEDIAPAGESDIR);
			return;
		}
		for (int i = 0; i < names.length; i++) {
			String key = names[i].replace(HTMLSUFFIX, "");
			key = key.replace('\'', '%');
			try {
				key = normalizeName(key);
			} catch (UnsupportedEncodingException e) {
				e.printStackTrace();
			}
			fileHashMap.put(key, names[i]);
		}
	}

	/**
	 * URL-decodes and then URL-encodes {@code name} using UTF-8, giving index
	 * entries and file names the same escaped form.
	 */
	private static String normalizeName(String name) throws UnsupportedEncodingException {
		String decoded = URLDecoder.decode(name, ENCODING);
		return URLEncoder.encode(decoded, ENCODING);
	}

	/** Closes {@code resource} if it is non-null, printing any failure. */
	private static void closeResource(Closeable resource) {
		if (resource == null) {
			return;
		}
		try {
			resource.close();
		} catch (IOException e) {
			e.printStackTrace();
		}
	}

	/**
	 * Runs the full pipeline: page-file mapping, existing evidence files, index
	 * entries, sampled input data, then task scheduling. A progress marker is
	 * printed after each of the first four steps.
	 *
	 * @param args ignored
	 */
	public static void main(String[] args) {
		MineEvidences evidence = new MineEvidences();
		evidence.initializeEntityFiles();
		System.out.println("First");
		evidence.initializeAvailableFiles();
		System.out.println("Second");
		evidence.fillHashMap();
		System.out.println("Third");
		evidence.initializeData();
		System.out.println("Fourth");
		evidence.start();
	}
}