/*
 * Copyright (c) 2011, SOCIETIES Consortium (WATERFORD INSTITUTE OF TECHNOLOGY (TSSG), HERIOT-WATT UNIVERSITY (HWU), SOLUTA.NET
 * (SN), GERMAN AEROSPACE CENTRE (Deutsches Zentrum fuer Luft- und Raumfahrt e.V.) (DLR), Zavod za varnostne tehnologije
 * informacijske držbe in elektronsko poslovanje (SETCCE), INSTITUTE OF COMMUNICATION AND COMPUTER SYSTEMS (ICCS), LAKE
 * COMMUNICATIONS (LAKE), INTEL PERFORMANCE LEARNING SOLUTIONS LTD (INTEL), PORTUGAL TELECOM INOAÇÃO, SA (PTIN), IBM Corp.,
 * INSTITUT TELECOM (ITSUD), AMITEC DIACHYTI EFYIA PLIROFORIKI KAI EPIKINONIES ETERIA PERIORISMENIS EFTHINIS (AMITEC), TELECOM
 * ITALIA S.p.a.(TI), TRIALOG (TRIALOG), Stiftelsen SINTEF (SINTEF), NEC EUROPE LTD (NEC))
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following
 * conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following
 * disclaimer in the documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
 * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
 * SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
package org.societies.orchestration.cpa.test.util;

import java.io.*;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;
import java.util.StringTokenizer;

/**
 * Loads a text file and splits it into sentence-like fragments.
 *
 * <p>The whole file is decoded with the platform default charset and split wherever a
 * lower-case ASCII letter is followed by a full stop and at least one whitespace
 * character. The full stop and the whitespace are consumed by the split, so fragments do
 * not carry their terminating period. Fragments of {@value #MAX_SENTENCE_LENGTH}
 * characters or more are discarded.</p>
 *
 * <p>Originally created by "epic" on 12/7/12.</p>
 */
public class SentenceExtractor {

    /** Fragments must be strictly shorter than this many characters to be kept. */
    private static final int MAX_SENTENCE_LENGTH = 500;

    /** Boundary pattern: a full stop preceded by {@code [a-z]} and followed by whitespace. */
    private static final String SENTENCE_BOUNDARY = "(?<=[a-z])\\.\\s+";

    /** Fragments that passed the length filter, in file order and not yet trimmed. */
    private final List<String> allsentences = new ArrayList<String>();

    /**
     * Reads the given file and extracts its sentences.
     *
     * @param file URI of the file to read; it is handed to {@link File#File(URI)}, which
     *             only accepts absolute, hierarchical {@code file:} URIs
     * @throws NullPointerException     if {@code file} is {@code null}
     * @throws IllegalArgumentException if {@code file} is rejected by {@link File#File(URI)}
     * @throws IllegalStateException    if the file cannot be opened or read; the underlying
     *                                  {@link IOException} is attached as the cause
     */
    public SentenceExtractor(URI file) {
        String data = readFile(file);
        for (String candidate : data.split(SENTENCE_BOUNDARY)) {
            if (candidate.length() < MAX_SENTENCE_LENGTH) {
                allsentences.add(candidate);
            }
        }
    }

    /**
     * Returns the complete content of a file decoded with the platform default charset.
     *
     * <p>Never returns {@code null}: I/O failures are reported as
     * {@link IllegalStateException} so that callers get the real cause instead of a later
     * {@code NullPointerException}.</p>
     *
     * @param path URI of the file to read
     * @return the decoded file content
     * @throws IllegalStateException if the file cannot be opened, mapped or read
     */
    private static String readFile(URI path) {
        if (path == null) {
            throw new NullPointerException("path");
        }
        FileInputStream stream = null;
        try {
            stream = new FileInputStream(new File(path));
            FileChannel fc = stream.getChannel();
            MappedByteBuffer bb = fc.map(FileChannel.MapMode.READ_ONLY, 0, fc.size());
            // TODO (from the original author): pass in an explicit decoder instead of the default.
            return Charset.defaultCharset().decode(bb).toString();
        } catch (IOException e) {
            throw new IllegalStateException("Unable to read sentence file: " + path, e);
        } finally {
            // stream is still null when the file could not be opened, so guard the close.
            if (stream != null) {
                try {
                    stream.close();
                } catch (IOException e) {
                    // Closing a read-only stream failed; the result (or the primary
                    // exception) is already determined, so only report it.
                    System.err.println("Warning: could not close " + path + ": " + e);
                }
            }
        }
    }

    /**
     * Returns up to {@code n} consecutive sentences, each trimmed of surrounding whitespace.
     *
     * <p>The returned array always has length {@code n}. When fewer than {@code n}
     * sentences exist from {@code start} onwards, the remaining slots are {@code null}.</p>
     *
     * @param start zero-based index of the first sentence to return
     * @param n     number of slots in the returned array
     * @return an array of length {@code n}; trailing entries may be {@code null}
     * @throws NegativeArraySizeException if {@code n} is negative
     * @throws IndexOutOfBoundsException  if {@code start} is negative and {@code n > 0}
     *                                    while at least one sentence is stored
     */
    public String[] getSentences(int start, int n) {
        String[] ret = new String[n];
        int total = allsentences.size();
        for (int offset = 0; offset < n && start + offset < total; offset++) {
            ret[offset] = allsentences.get(start + offset).trim();
        }
        return ret;
    }

    /**
     * @return the number of sentences that were kept after the length filter
     */
    public int size() {
        return allsentences.size();
    }

    /**
     * Manual smoke test: loads {@code reuters21578content.txt} from the class path, prints
     * the sentence at index 20 and counts sentences longer than 500 and 1500 characters.
     *
     * @param args ignored
     */
    public static void main(String args[]) {
        URL resource = SentenceExtractor.class.getClassLoader().getResource("reuters21578content.txt");
        if (resource == null) {
            System.err.println("resource not found on class path: reuters21578content.txt");
            return;
        }

        SentenceExtractor extractor;
        try {
            extractor = new SentenceExtractor(resource.toURI());
        } catch (URISyntaxException e) {
            // Without a valid URI there is nothing to inspect.
            e.printStackTrace();
            return;
        }

        // The slot is null (printed as "null") when fewer than 21 sentences were extracted.
        String sentence = extractor.getSentences(20, 1)[0];
        System.out.println("sentence 1 : \"" + sentence + "\"");

        // Small test of lengths: both counters are expected to stay at 0 because the
        // constructor already drops fragments of 500 characters or more.
        int count = 0;
        int count2 = 0;
        for (int i = 0; i < extractor.size(); i++) {
            sentence = extractor.getSentences(i, 1)[0];
            if (sentence.length() > 500) {
                count++;
            }
            if (sentence.length() > 1500) {
                count2++;
                System.out.println("found very long sentence of length: " + sentence.length());
            }
        }
        System.out.println("count: " + count);
        System.out.println("count2: " + count2);
        System.out.println("totalcount: " + extractor.size());
    }
}