package rainbownlp.preprocess;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import rainbownlp.core.Artifact;
import rainbownlp.util.FileUtil;
import rainbownlp.util.HibernateUtil;
import edu.stanford.nlp.process.PTBTokenizer;
/**
 * Loads plain-text documents into the artifact store: one Document artifact per
 * file, one Sentence artifact per tokenized sentence, one Word artifact per
 * token. Consecutive sentences and words are chained via previous/next links
 * and persisted through {@link HibernateUtil}.
 */
public class SimpleDocumentLoader extends DocumentAnalyzer {

    /** Documents created by the most recent call to {@link #processDocuments(String)}. */
    protected List<Artifact> documents;

    /** File extension used to select documents when loading from a directory. */
    protected String documentExtension = "txt";

    /** @return documents loaded by the last {@link #processDocuments(String)} call */
    public List<Artifact> getDocuments() {
        return documents;
    }

    /**
     * Simply calls {@link #loadSentences(Artifact)}; override this method for
     * more complex loaders.
     *
     * @param doc document artifact to process
     * @throws IOException if the backing file cannot be read
     * @throws Exception   if a subclass loader fails
     */
    protected void processDocument(Artifact doc) throws Exception {
        loadSentences(doc);
    }

    /**
     * Uses the tokenizer to split the file backing {@code parentArtifact} into
     * sentences, persisting each as a Sentence artifact linked to its
     * predecessor, then loads the words of each sentence.
     *
     * @param parentArtifact can be a document, paragraph or any other document section
     * @throws IOException if the backing file cannot be read
     */
    protected void loadSentences(Artifact parentArtifact) throws IOException {
        Tokenizer docTokenizer = new Tokenizer(parentArtifact.getAssociatedFilePath());
        HashMap<Integer, String> sentences = docTokenizer.getSentences();
        Artifact previousSentence = null;
        for (int curSentenceIndex = 0; curSentenceIndex < sentences.size(); curSentenceIndex++) {
            String tokenizedSentence = sentences.get(curSentenceIndex);
            List<Integer> tokenStarts =
                    docTokenizer.sentences_tokens_indexes.get(curSentenceIndex);
            List<String> tokens =
                    docTokenizer.sentences_tokens_string.get(curSentenceIndex);
            // Sentence indexes are assumed contiguous from 0; guard against a
            // missing or empty entry, which would otherwise NPE below.
            if (tokens == null || tokens.isEmpty()) {
                continue;
            }
            Artifact newSentence = Artifact.getInstance(Artifact.Type.Sentence,
                    parentArtifact.getAssociatedFilePath(), tokenStarts.get(0)); // line number starts from 1
            newSentence.setParentArtifact(parentArtifact);
            newSentence.setLineIndex(curSentenceIndex + 1);
            newSentence.setContent(tokenizedSentence);
            newSentence.setArtifactOptionalCategory(dsType.name());
            if (previousSentence != null) {
                // Chain sentences and re-save the predecessor so its "next"
                // pointer is persisted.
                newSentence.setPreviousArtifact(previousSentence);
                previousSentence.setNextArtifact(newSentence);
                HibernateUtil.save(previousSentence);
            }
            HibernateUtil.save(newSentence);
            loadWords(newSentence, tokens, tokenStarts, curSentenceIndex);
            previousSentence = newSentence;
            // Clear the loader session per sentence to keep memory bounded on
            // large documents.
            HibernateUtil.clearLoaderSession();
        }
    }

    /**
     * Persists one Word artifact per token of {@code parentSentence}, chaining
     * consecutive words via previous/next links.
     *
     * @param parentSentence sentence artifact that owns the words
     * @param tokens         PTB-escaped token strings of the sentence
     * @param starts         character start offset of each token (parallel to {@code tokens})
     * @param parentOffset   zero-based sentence index; stored as 1-based line index
     */
    protected void loadWords(Artifact parentSentence, List<String> tokens,
            List<Integer> starts, int parentOffset) {
        Artifact previousWord = null;
        for (int curTokenIndex = 0; curTokenIndex < tokens.size(); curTokenIndex++) {
            // Convert PTB escapes (e.g. "-LRB-") back to their plain-text form.
            String textContent = PTBTokenizer.ptbToken2Text(tokens.get(curTokenIndex));
            Artifact newWord = Artifact.getInstance(
                    Artifact.Type.Word,
                    parentSentence.getAssociatedFilePath(), starts.get(curTokenIndex));
            newWord.setContent(textContent);
            newWord.setParentArtifact(parentSentence);
            newWord.setLineIndex(parentOffset + 1);
            newWord.setWordIndex(curTokenIndex);
            newWord.setArtifactOptionalCategory(dsType.name());
            if (previousWord != null) {
                // Chain words and re-save the predecessor so its "next" pointer
                // is persisted.
                newWord.setPreviousArtifact(previousWord);
                previousWord.setNextArtifact(newWord);
                HibernateUtil.save(previousWord);
            }
            HibernateUtil.save(newWord);
            previousWord = newWord;
        }
    }

    /**
     * Creates Document artifact(s) for {@code rootPath} — a single file, or a
     * directory scanned for files with {@link #documentExtension} — and calls
     * {@link #processDocument(Artifact)} for each.
     *
     * @param rootPath path to a document file or a directory of documents
     * @return the loaded document artifacts (also stored in {@link #documents})
     */
    @Override
    public List<Artifact> processDocuments(String rootPath) {
        File root = new File(rootPath);
        List<Artifact> loadedDocuments = new ArrayList<Artifact>();
        if (root.exists() && root.isFile()) {
            Artifact newDoc = Artifact.getInstance(Artifact.Type.Document, rootPath, 0);
            newDoc.setArtifactOptionalCategory(dsType.name());
            HibernateUtil.save(newDoc);
            try {
                processDocument(newDoc);
            } catch (Exception e) {
                // Best-effort: a failing document should not abort the load.
                e.printStackTrace();
            }
            loadedDocuments.add(newDoc);
        } else {
            List<File> files =
                    FileUtil.getFilesInDirectory(rootPath, documentExtension);
            // First create and save all document artifacts...
            for (File file : files) {
                Artifact newDoc =
                        Artifact.getInstance(Artifact.Type.Document, file.getAbsolutePath(), 0);
                newDoc.setArtifactOptionalCategory(dsType.name());
                HibernateUtil.save(newDoc);
                loadedDocuments.add(newDoc);
            }
            // ...then process each one, tolerating per-document failures.
            for (Artifact doc : loadedDocuments) {
                System.out.print("\nLoading document: " + doc.getAssociatedFilePath());
                try {
                    processDocument(doc);
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        }
        this.documents = loadedDocuments;
        return this.documents;
    }
}