/** * */ package context.core.task.parsetree; import context.core.entity.CorpusData; import context.core.entity.FileData; import context.core.entity.TabularData; import context.core.textnets.Network; import context.core.textnets.Network.FileType; import context.core.tokenizer.CustomEdge; import context.core.tokenizer.SemanticAnnotation; import context.core.util.JavaIO; import java.io.File; import java.io.IOException; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Set; /** * @author Shubhanshu * */ public class SyntaxDeepCorpus { /** * */ private ParseTreeNetworkTaskInstance instance; private CorpusData input; private List<TabularData> tabularOutput; private HashMap<String, EdgeStream> streams; private boolean docLevel; private String tabularOutPath = ""; private Set<String> selectedTypes; private boolean advanced; /** * * @param instance */ public SyntaxDeepCorpus(ParseTreeNetworkTaskInstance instance) { // TODO Auto-generated constructor stub super(); this.instance = instance; this.input = (CorpusData) instance.getInput(); this.tabularOutput = instance.getTabularOutput(); this.streams = new HashMap<>(); this.selectedTypes = instance.getSelectedTypes(); this.advanced = instance.isAdvance(); if (instance.getAggregation() == 0) { // 0 - per document 1- per corpus this.setDocLevel(true); } else { this.setDocLevel(false); } // this.setUnit(UNITOFANALYSIS.SENTENCE); // this.setWindowSize(Integer.MAX_VALUE); // this.setFilterLabels(instance.getFilterLabels()); // System.err.println("Window Size: " + instance.getDistance() + ", " + this.getWindowSize()); // this.setDocLevel(false); } /** * * @param docLevel */ public void setDocLevel(boolean docLevel) { this.docLevel = docLevel; } /** * * @param fileName * @param es */ public void addStream(String fileName, EdgeStream es) { if (streams == null) { streams = new HashMap<>(); } this.streams.put(fileName, es); } /** * @return the docLevel */ public boolean isDocLevel() { return docLevel; } /** * */ public void genStreamsFromCorpus() { System.err.println("Adding Text Streams: " + input.getFiles().size()); List<FileData> files = input.getFiles(); for (FileData ff : files) { System.out.println("Reading file: " + ff.getFile().getAbsolutePath()); File file = ff.getFile(); String text = ""; try { text = JavaIO.readFile(file); } catch (IOException e) { e.printStackTrace(); } text = text.replaceAll("\\p{Cc}", " "); text = text.replaceAll("[^A-Za-z0-9 :;!\\?\\.,\'\"-]", " "); String fileName = ff.getName().getValue(); final EdgeStream edgestream = getEdges(fileName, text); this.addStream(fileName, edgestream); System.err.println("Finished adding Text Stream: " + fileName); } } /** * * @param fileName * @param inText * @return */ public EdgeStream getEdges(String fileName, String inText) { EdgeStream estr = new EdgeStream(fileName); Map<String, CustomEdge> tokens = null; if (advanced) { tokens = SemanticAnnotation.tokenize(inText, fileName); for (String key : tokens.keySet()) { final CustomEdge cedge = tokens.get(key); //TODO: more filtering here if needed. String type = cedge.getType(); // cedge.setWord1(cedge.getWord1().toLowerCase()); // cedge.setWord2(cedge.getWord2().toLowerCase()); String prefix_type = getPrefix(type); if (this.selectedTypes.contains(prefix_type)) { estr.addEdge(cedge); } } } else { tokens = SemanticAnnotation.tokenizeSPO(inText, fileName); for (String key : tokens.keySet()) { final CustomEdge cedge = tokens.get(key); // cedge.setWord1(cedge.getWord1().toLowerCase()); // cedge.setWord2(cedge.getWord2().toLowerCase()); estr.addEdge(cedge); } } return estr; } /** * * @param outputDir */ public void saveNetworks(String outputDir) { String fileName = "SemanticNetwork"; saveNetworks(fileName, outputDir, FileType.GRAPHML); } /** * * @param fileName * @param outputDir * @param ft */ public void saveNetworks(String fileName, String outputDir, FileType ft) { if (docLevel) { for (String key : streams.keySet()) { Network net = new Network(); net.setEdgeTablePath(this.tabularOutPath + "_" + key + ".csv"); EdgeStream t = streams.get(key); t.makeNetwork(net); net.saveNet(fileName + "_" + key, outputDir, ft); } } else { Network net = new Network(); net.setEdgeTablePath(this.tabularOutPath); for (String key : streams.keySet()) { EdgeStream t = streams.get(key); t.makeNetwork(net); } net.saveNet(fileName, outputDir, ft); } } /** * @return the tabularOutPath */ public synchronized String getTabularOutPath() { return tabularOutPath; } /** * @param tabularOutPath the tabularOutPath to set */ public synchronized void setTabularOutPath(String tabularOutPath) { this.tabularOutPath = tabularOutPath; } private static String getPrefix(String type) { int lastUnderlineIndex = type.lastIndexOf("_"); if (lastUnderlineIndex == -1) { return type; } else { return type.substring(0, lastUnderlineIndex); } } }