package rainbownlp.preprocess; import java.io.IOException; import java.io.StringReader; import java.sql.SQLException; import java.util.ArrayList; import java.util.HashMap; import java.util.Hashtable; import java.util.List; import rainbownlp.core.Artifact; import rainbownlp.util.FileUtil; import edu.stanford.nlp.ling.Word; import edu.stanford.nlp.process.TokenizerFactory; import edu.stanford.nlp.process.PTBTokenizer; public class Tokenizer { String txt_file_path; String tokenization_file; String original_txt_content = ""; String compressedText; public static void main(String[] args) throws SQLException { fixDashSplitted(); } static TokenizerFactory<Word> tf; public static List<Word> getTokens(String sentence) { if(tf == null) tf = PTBTokenizer.factory(); List<Word> tokens_words = tf.getTokenizer(new StringReader(sentence)).tokenize(); return tokens_words; } public static void fixDashSplitted() throws SQLException { // String q = "SELECT * from Artifact where text_content like '%-%' and artifact_type = 'Word'"; // ResultSet artifacts = Util.db.executeReader(q); // while(artifacts.next()) // { // int artifact_id = artifacts.getInt("artifact_id"); // Artifact curArti = new Artifact(artifact_id); // fixMergedNameEntity(curArti); // } } public ArrayList<String> paragraphs = new ArrayList<String>(); public HashMap<Integer, String> sentences = new HashMap<Integer, String>(); public HashMap<Integer,List<Integer>> sentences_tokens_indexes = new HashMap<Integer, List<Integer>>(); public HashMap<Integer,List<Word>> sentences_tokens = new HashMap<Integer,List<Word>>(); public Hashtable<Integer, List<String>> sentences_tokens_string = new Hashtable<Integer, List<String>>(); public static void fixMergedNameEntity(Artifact curArtifact) throws SQLException { // String originalContent = curArtifact.getTextContent(); // String[] parts = originalContent.split("-"); // String previousContent = ""; // for(int i=0;i<parts.length;i++) // { // String content = parts[i]; // int j=i; // do // { // if((NameEntityTable.possibleNameEntity(content)|| // EventTriggers.isPossibleTrigger(content)) // && !content.equals(originalContent) && // content.length()>1) // { // Util.log("Fixing:"+originalContent, Level.INFO); // if(i==0) // {//NE is at the beginning // //shorten current artifact // curArtifact.setTextContent(content); // //add a new artifact at the end // String remainingContent = ""; // for(int k=j+1;k<parts.length;k++){ // if(!remainingContent.equals("")) // remainingContent+= "-"; // remainingContent+=parts[k]; // } // // Artifact dashArtifact = new Artifact("-", // Type.Word, curArtifact.associatedFilePath, // curArtifact.getStartIndex()+content.length(), // curArtifact.getParentArtifact()); // if(remainingContent.equals("")) // { // dashArtifact.setNextArtifact(curArtifact.getNextArtifact()); // curArtifact.setNextArtifact(dashArtifact); // }else{ // Artifact neArtifact = new Artifact(remainingContent, // Type.Word, curArtifact.associatedFilePath, // curArtifact.getStartIndex()+content.length()+1, // curArtifact.getParentArtifact()); // neArtifact.setNextArtifact(curArtifact.getNextArtifact()); // curArtifact.setNextArtifact(dashArtifact); // dashArtifact.setNextArtifact(neArtifact); // } // // // } // if(parts.length>1 && // i==(parts.length-1)) // {//NE is at the end // //shorten current artifact // Artifact neArtifact = new Artifact(content, // Type.Word, curArtifact.associatedFilePath, // curArtifact.getStartIndex()+previousContent.length()+1, // curArtifact.getParentArtifact()); // // if(previousContent.equals("")) // { // curArtifact.setTextContent("-"); // neArtifact.setNextArtifact(curArtifact.getNextArtifact()); // curArtifact.setNextArtifact(neArtifact); // }else{ // curArtifact.setTextContent(previousContent); // //add a new artifact at the end // // Artifact dashArtifact = // new Artifact("-", Type.Word, curArtifact.associatedFilePath, // curArtifact.getStartIndex()+previousContent.length(), // curArtifact.getParentArtifact()); // // neArtifact.setNextArtifact(curArtifact.getNextArtifact()); // curArtifact.setNextArtifact(dashArtifact); // dashArtifact.setNextArtifact(neArtifact); // } // } // return; // } // j++; // if(j<parts.length) // content+="-"+parts[j]; // }while(j<parts.length); // if(!previousContent.equals("")) // previousContent+="-"; // previousContent += parts[i]; // } } public Tokenizer(String associatedFilePath) throws IOException { txt_file_path = associatedFilePath; tokenization_file = txt_file_path.replace(".txt", ".tok"); if(FileUtil.fileExists(tokenization_file)) processFileWithTokenization(tokenization_file); else processFile(); } public Tokenizer(){ } public void processLines(String[] sentencesToProcess, Integer[] sentencesStarts, int parentOffset) throws IOException{ for(int line_number = 0;line_number<sentencesToProcess.length;line_number++){ String line = sentencesToProcess[line_number]; List<Word> tokensInSentence = getTokens(line); ArrayList<Integer> tokens_indexes = new ArrayList<Integer>(); ArrayList<String> tokens = new ArrayList<String>(); for(int token_index = 0;token_index<tokensInSentence.size();token_index++) { tokens_indexes.add(tokensInSentence.get(token_index).beginPosition()+ sentencesStarts[line_number]+ parentOffset); tokens.add(tokensInSentence.get(token_index).word()); } sentences_tokens_indexes.put(line_number, tokens_indexes); sentences_tokens.put(line_number, tokensInSentence); sentences_tokens_string.put(line_number, tokens); sentences.put(line_number, line); } } public void processFile() throws IOException{ List<String> lines = FileUtil.loadLineByLine(txt_file_path); int sentence_start=0; for(int line_number = 0;line_number<lines.size();line_number++){ String line = lines.get(line_number); List<Word> tokensInSentence = getTokens(line); ArrayList<Integer> tokens_indexes = new ArrayList<Integer>(); for(int token_index = 0;token_index<tokensInSentence.size();token_index++) { tokens_indexes.add(tokensInSentence.get(token_index).beginPosition()+sentence_start+line_number+1); } sentences_tokens_indexes.put(line_number, tokens_indexes); sentences_tokens.put(line_number, tokensInSentence); sentences.put(line_number, line); sentence_start+= line.length(); } } public void processFileWithTokenization(String tokenizationFilePath) throws IOException{ List<String> lines_tokenized = FileUtil.loadLineByLine(tokenizationFilePath); original_txt_content = FileUtil.readWholeFile(txt_file_path).replaceAll("\\s+", " "); compressedText = original_txt_content.replaceAll(" |\\n", ""); if(compressedText.equals("")) return; int curIndex = 0; String compressed_original_sofar = ""+original_txt_content.charAt(curIndex); String compressed_tokenized_sofar = ""; int sentence_start=0; for(int line_number = 0;line_number<lines_tokenized.size();line_number++){ String line = lines_tokenized.get(line_number); List<Word> tokensInSentence = new ArrayList<Word>(); String[] tokensInSentence_str = line.split(" "); ArrayList<Integer> tokens_indexes = new ArrayList<Integer>(); for(int token_index = 0;token_index<tokensInSentence_str.length;token_index++) { if(!tokensInSentence_str[token_index].equals("")) { String tmp_compressed_tokenized_sofar = compressed_tokenized_sofar + tokensInSentence_str[token_index].charAt(0); while(!compressed_original_sofar.equals( tmp_compressed_tokenized_sofar)) { do{ curIndex++; compressed_original_sofar += original_txt_content.charAt(curIndex); }while(original_txt_content.charAt(curIndex) == ' '); compressed_original_sofar = compressed_original_sofar.replaceAll(" |\\n", ""); } } tokens_indexes.add(curIndex); tokensInSentence.add(new Word(tokensInSentence_str[token_index], curIndex, curIndex+tokensInSentence_str[token_index].length())); compressed_tokenized_sofar += tokensInSentence_str[token_index]; } sentences_tokens_indexes.put(line_number, tokens_indexes); sentences_tokens.put(line_number, tokensInSentence); sentences.put(line_number, line); sentence_start+= line.length(); } } public HashMap<Integer, String> getSentences() { return sentences; } }