package lda.wikievidence.modelcreation; import hbase.operations.HBaseOperations; import java.io.BufferedReader; import java.io.File; import java.io.FileNotFoundException; import java.io.FileReader; import java.io.IOException; import java.io.InputStream; import java.io.PrintWriter; import java.io.StringReader; import java.util.Arrays; import java.util.HashSet; import java.util.Map; import java.util.Random; import java.util.Set; import org.apache.log4j.Logger; import org.xml.sax.Attributes; import org.xml.sax.ContentHandler; import org.xml.sax.InputSource; import org.xml.sax.Locator; import org.xml.sax.SAXException; import org.xml.sax.XMLReader; import org.xml.sax.helpers.XMLReaderFactory; public class WikipediaLDAThreadExtractEvidenceTerms extends LDAClient { public static final int RANDOMDOCUMENTS = 1000; public static final String WIKIPEDIAPAGESDIR = "/mnt/storage/zwicklbauer/WikiParse/temp/plain_reduced/"; private String lastIterationPath; private String evidenceFilePath; private int topTerms; private String[] evidence; private Map<Integer, Output> hashmap; private byte[] dataset; private String[] circleEnts; private Map<String, String> realFileNames; public WikipediaLDAThreadExtractEvidenceTerms(int threadnr, int topTerms, Map<Integer, Output> map, String[] circleEnts, Map<String, String> realFileNames) { super(threadnr); this.topTerms = topTerms; this.lastIterationPath = modeloutputPath + "00025/"; this.evidenceFilePath = lastIterationPath + "summary.txt"; this.hashmap = map; this.circleEnts = circleEnts; this.realFileNames = realFileNames; } @Override public void run() { // Create LDA Configuration File byte[] config = ConfigCreation.createEvidenceExtractionConfig( datafilePath, modeloutputPath, topTerms); writeOutput(config, configPath); this.dataset = createDataFile(circleEnts); // Create Datafile writeOutput(this.dataset, datafilePath); // Execute LDA try { Process proc = Runtime.getRuntime().exec( "java -jar tmt-assembly-0.4.0.jar " + configPath); proc.waitFor(); // Then retrieve the process output InputStream in = proc.getInputStream(); InputStream err = proc.getErrorStream(); byte b[] = new byte[in.available()]; in.read(b, 0, b.length); byte c[] = new byte[err.available()]; err.read(c, 0, c.length); } catch (IOException e) { e.printStackTrace(); } catch (InterruptedException e1) { e1.printStackTrace(); } extractEvidence(); writeEvidence(this.evidence, this.circleEnts[0]); // Delete Thread Directory deleteDir(threadDir); } private void extractEvidence() { BufferedReader buffered = null; try { buffered = new BufferedReader(new FileReader(new File( evidenceFilePath))); int neccLine = extractTopicLine(lastIterationPath, circleEnts[0]); int it = 0; String line = null; while (it < neccLine) { line = buffered.readLine(); if (line.equalsIgnoreCase("")) { ++it; } } StringBuffer buff = new StringBuffer(); while ((line = buffered.readLine()) != null && !line.equalsIgnoreCase("")) { buff.append(line + ";"); } String ev = buff.toString(); String[] split = ev.split(";"); String[] vals = new String[split.length - 1]; for (int i = 1; i < split.length; i++) { vals[i - 1] = split[i]; } this.evidence = vals; } catch (FileNotFoundException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } finally { if (buffered != null) { try { buffered.close(); } catch (IOException e) { e.printStackTrace(); } } } } private void writeEvidence(String[] evidence, String entityName) { try { PrintWriter writer = new PrintWriter(new File( MineEvidences.EVIDENCEDIR + "/" + entityName)); for (int i = 0; i < evidence.length; i++) { writer.println(evidence[i]); } writer.close(); } catch (FileNotFoundException e) { e.printStackTrace(); } } public byte[] createDataFile(String[] entities) { StringBuffer buffer = new StringBuffer(); // Write main file content writeMainFileContent(entities[0], buffer); // Write main content int entryNumber = createBasicData(entities, "LDADC_Context", buffer); // Write randomly selected documents Set<Output> set = selectRandomDocuments(RANDOMDOCUMENTS); for (Output o : set) { Set<String> setRows = new HashSet<String>(); try { HBaseOperations.getInstance().getRow("LDADC_Context", (o.getUrl() + ".html"), "data", setRows, 200); for (String s : setRows) { buffer.append(entryNumber++ + "," + o.getUrl() + ",\"" + s + "\""); buffer.append(System.lineSeparator()); } } catch (IOException e) { e.printStackTrace(); } if (entryNumber > 1000) { break; } } return buffer.toString().getBytes(); } private void writeMainFileContent(String c, StringBuffer buffer) { // System.out.println(c +" "+realFileNames.containsKey(c)); if (realFileNames.containsKey(c)) { File f = new File(WIKIPEDIAPAGESDIR + realFileNames.get(c)); String content = ""; try { BufferedReader reader = new BufferedReader(new FileReader(f)); String line = null; while ((line = reader.readLine()) != null) { content += line; } reader.close(); } catch (FileNotFoundException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } try { XMLReader xmlReader = XMLReaderFactory.createXMLReader(); Handler handler = new Handler(); InputSource inputSource = new InputSource(new StringReader( content)); xmlReader.setContentHandler(handler); xmlReader.parse(inputSource); buffer.append(1 + "," + c + ",\"" + handler.getDocumentText() + "\""); buffer.append(System.lineSeparator()); } catch (SAXException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } } else { buffer.append(1 + "," + c + ",\"\""); buffer.append(System.lineSeparator()); } } private Set<Output> selectRandomDocuments(int nrRandomDocs) { Set<Output> outputSet = new HashSet<Output>(); int[] docs = createRandomNrs(nrRandomDocs, (hashmap.size() - 1)); for (int i = 0; i < docs.length; i++) { outputSet.add(hashmap.get(docs[i])); } return outputSet; } private int[] createRandomNrs(int nrRandomDocs, int lines) { Random ran = new Random(); int[] res = new int[nrRandomDocs]; for (int i = 0; i < nrRandomDocs; i++) { res[i] = ran.nextInt(lines); } Arrays.sort(res); return res; } private int createBasicData(String[] entities, String hbaseTable, StringBuffer buffer) { int entryNumber = 2; for (int i = 0; i < entities.length; i++) { Set<String> set = new HashSet<String>(); try { HBaseOperations.getInstance().getRow(hbaseTable, (entities[i] + ".html"), "data", set, 300); } catch (IOException e) { Logger.getRootLogger().error("Error:", e); } for (String l : set) { buffer.append(entryNumber++ + "," + entities[i] + ",\"" + l + "\""); buffer.append(System.lineSeparator()); } } return entryNumber; } public class Output { private String url; private String content; private String mention; public String getUrl() { return url.replaceAll(".html", ""); } public void setUrl(String url) { this.url = url; } public String getContent() { return content; } public void setContent(String content) { this.content = content; } public String getMention() { return mention; } public void setMention(String mention) { this.mention = mention; } @Override public int hashCode() { return content.hashCode(); } @Override public boolean equals(Object obj) { return (this.content == ((Output) obj).getContent()); } } class Handler implements ContentHandler { private StringBuffer documentText; public Handler() { super(); this.documentText = new StringBuffer(); } @Override public void characters(char[] arg0, int arg1, int arg2) throws SAXException { this.documentText.append(new String(arg0, arg1, arg2)); } @Override public void endDocument() throws SAXException { } @Override public void endElement(String arg0, String arg1, String arg2) throws SAXException { } @Override public void endPrefixMapping(String arg0) throws SAXException { } @Override public void ignorableWhitespace(char[] arg0, int arg1, int arg2) throws SAXException { } @Override public void processingInstruction(String arg0, String arg1) throws SAXException { } @Override public void setDocumentLocator(Locator arg0) { } @Override public void skippedEntity(String arg0) throws SAXException { } @Override public void startDocument() throws SAXException { } @Override public void startElement(String arg0, String arg1, String arg2, Attributes arg3) throws SAXException { } @Override public void startPrefixMapping(String arg0, String arg1) throws SAXException { } public String getDocumentText() { return documentText.toString(); } } public String[] getEvidence() { return evidence; } public String getTopic() { return circleEnts[0]; } }