/* * Copyright 2004-2010 Information & Software Engineering Group (188/1) * Institute of Software Technology and Interactive Systems * Vienna University of Technology, Austria * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.ifs.tuwien.ac.at/dm/somtoolbox/license.html * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package at.tuwien.ifs.somtoolbox.summarisation.parser; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStreamReader; import java.util.ArrayList; /** * @author Julius Penaranda * @version $Id: SentenceParser.java 3590 2010-05-21 10:43:45Z mayer $ */ public class SentenceParser { private String prefix = null; public ArrayList<String>[] parsedDocuments = null; private int numDocs = 0; private ArrayList<String> filenames = new ArrayList<String>(); private HeadlineTextParser headtextparser = null; private DocumentParser docparser = null; @SuppressWarnings("unchecked") public SentenceParser(Object[] itemNames) { this.parsedDocuments = new ArrayList[itemNames.length]; } public void setFileNamePrefix(String fnprefix) { this.prefix = fnprefix; } public void find_parse_Document(String name) { try { File file = new File(prefix + "/" + java.net.URLDecoder.decode(name, "UTF-8")); filenames.add(name); // if (file.isDirectory()) { // File[] files = file.listFiles(); // if (files != null) { // for (int i = 0; i < files.length; i++) { // if (files[i].getName().equals(name)) { if (file.exists()) { System.out.println("Document found: " + name); System.out.println("Parsing document: " + name); this.parsedDocuments[numDocs++] = parseDocument(file); System.out.println("sentences in document " + file + ": " + (this.parsedDocuments[numDocs - 1].size() - 1)); // System.out.println("parsedDocument: "+ parsedDocuments.length); } else { System.out.println("File not found: " + file.getAbsolutePath()); } // } // } // else { // System.out.println("SentenceParser: Please specify data item path"); // } } catch (IOException io) { System.err.println("an IO-Error occured"); } } private ArrayList<String> parseDocument(File document) throws IOException { FileInputStream fstream = new FileInputStream(document); BufferedReader d = new BufferedReader(new InputStreamReader(fstream)); ArrayList<String> result = new ArrayList<String>(); String line = d.readLine(); while (line.equals("")) { // ignore blank spaces line = d.readLine(); } if (line.indexOf("<HEADLINE", 0) != -1) { if (this.headtextparser == null) { headtextparser = new HeadlineTextParser(this); } headtextparser.setReader(d); headtextparser.parse(); result = headtextparser.getDocument(); } else if (line.indexOf("Subject") != -1) { System.out.println("skipping header line: " + line); } else { if (this.docparser == null) { docparser = new DocumentParser(this); } docparser.setReader(d); docparser.newDoc(); docparser.storeTitle(line); docparser.storeText(); result = docparser.getDocument(); } return result; } /** finds sentence within String and add it to parseddoc */ public String findSentence(String line, ArrayList<String> parseddoc) { int offS = 0; int index = 0; boolean found = false; while (line.indexOf(".", offS) != -1 || line.indexOf("; ", offS) != -1 || line.indexOf("? ", offS) != -1 || line.indexOf("! ", offS) != -1) { index = line.length(); if (line.indexOf(".", offS) != -1) { if (index >= line.indexOf(".", offS)) { // check what comes after '.' // if ". " is at the end of line if (line.indexOf(". ", offS) + 2 == line.length()) { System.out.println("special case: eol"); index = line.indexOf(". ", offS); found = true; } else if (line.indexOf(".", offS) + 1 == line.length()) { System.out.println("special case2: eol"); index = line.indexOf(".", offS); found = true; } else { // check what comes after "." char y = line.charAt(line.indexOf(".", offS) + 1); // if y=='"' if (y == 34) { index = line.indexOf(".", offS) + 1; System.out.println("case anfuehrungszeichen"); found = true; } // check what comes after ". " if (!found && line.indexOf(". ", offS) != -1) { char z = line.charAt(line.indexOf(". ", offS) + 2); // if z is number, lower case, '-' or '(' if (z >= 49 && z <= 57 || z >= 97 && z <= 122 || z == 45 || z == 40) { System.out.println("case lower, number"); offS = line.indexOf(". ", offS) + 2; } else { System.out.println("normal case"); index = line.indexOf(". ", offS); found = true; } } else { offS = line.indexOf(".", offS) + 1; } } } } if (line.indexOf("; ", offS) != -1) { System.out.println("; gefunden"); if (index >= line.indexOf("; ", offS)) { index = line.indexOf("; ", offS); } found = true; } if (line.indexOf("? ", offS) != -1) { if (index >= line.indexOf("? ", offS)) { index = line.indexOf("? ", offS); } found = true; } if (line.indexOf("! ", offS) != -1) { if (index >= line.indexOf("! ", offS)) { index = line.indexOf("! ", offS); } found = true; } if (found) { // System.out.println("index: "+index+" ll: "+line.length()); if (line.substring(0, index + 1).indexOf("&UR") != -1) { System.out.println("&UR wird ignoriert"); } else { parseddoc.add(line.substring(0, index + 1)); } // System.out.println("Sentence geaddet: "+line.substring(0, index+1)+" index: "+index+1); line = line.substring(index + 1); System.out.println("line �brig: " + line); System.out.println("offs: " + offS); found = false; offS = 0; } } return line; } /** * deletes tags within a Web document * * @param line String * @return String */ String delete_tags(String line) { Character sign; char[] chars; boolean not_add = false; String parsedline = ""; chars = line.toCharArray(); for (char c : chars) { sign = new Character(c); if (sign.equals(new Character('<'))) { not_add = true; } else if (sign.equals(new Character('>')) && not_add) { not_add = false; } else if (!not_add) { parsedline = parsedline + sign.toString(); } } return parsedline; } public ArrayList<String>[] getParsedDocuments() { return this.parsedDocuments; } public ArrayList<String> getFileNames() { return filenames; } } class HeadlineTextParser { private SentenceParser sParser = null; private BufferedReader reader = null; private ArrayList<String> document = null; public HeadlineTextParser(SentenceParser parser) { this.sParser = parser; } public void setReader(BufferedReader rd) { this.reader = rd; } public void parse() { boolean start = false; document = new ArrayList<String>(); try { // store title // String title= sParser.delete_tags(line); String sentence = new String(""); String title = ""; String line = reader.readLine(); while (line.indexOf("</HEADLINE", 0) == -1) { title = title + sParser.delete_tags(line); line = reader.readLine(); } document.add(title); // store text while (line != null) { if (!line.equals("")) { if (line.indexOf("<TEXT", 0) > -1) { start = true; } if (line.indexOf("</TEXT", 0) > -1) { if (sentence.length() > 2) { document.add(sentence); } System.out.println("stop parsing.."); start = false; } if (start) { if (line.indexOf(".", 0) != -1 || line.indexOf(";", 0) != -1 || line.indexOf("?", 0) != -1 || line.indexOf("!", 0) != -1) { sentence = sParser.findSentence(sParser.delete_tags(sentence + " " + line), this.document); } else { sentence = sentence + " " + line; } } } line = reader.readLine(); } } catch (Exception e) { System.err.println(e.getMessage()); } } public ArrayList<String> getDocument() { return this.document; } } class DocumentParser { private SentenceParser sParser = null; private BufferedReader reader = null; private ArrayList<String> document = null; public DocumentParser(SentenceParser parser) { this.sParser = parser; } public void setReader(BufferedReader rd) { this.reader = rd; } public void newDoc() { this.document = new ArrayList<String>(); } public void storeTitle(String line) { document.add(line); } public void storeText() { try { String sentence = new String(""); String line = reader.readLine(); // store text while (line != null) { if (!line.equals("")) { if (line.indexOf(".", 0) != -1 || line.indexOf(";", 0) != -1 || line.indexOf("?", 0) != -1 || line.indexOf("!", 0) != -1) { sentence = sParser.findSentence(sParser.delete_tags(sentence + " " + line), this.document); } else { sentence = sentence + " " + line; } } line = reader.readLine(); } /* * System.out.println("changing code for lyrics corpora"); System.out.println("parsing lyrics document.."); while(line!=null) { * this.document.add(line); line = reader.readLine(); } */ } catch (Exception e) { System.err.println(e.getMessage()); } } public ArrayList<String> getDocument() { return this.document; } }