/*
 *    PreprocessedDocument.java
 *    Copyright (C) 2007 David Milne, d.n.milne@gmail.com
 *
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

package org.wikipedia.miner.annotation.preprocessing;

import java.util.*;

/**
 * This class stores a document that is ready to be processed by linkDetector, disambiguator,
 * documentTagger, etc.
 * <p>
 * Besides the three text views (original, preprocessed, context) and the set of banned topic ids,
 * the document keeps a forward-only cursor over its region tags. That cursor is advanced by
 * {@link #getDoneIdsInCurrentRegion(int)} and rewound by {@link #resetRegionTracking()}.
 * <p>
 * The class performs no synchronization.
 *
 * @author David Milne
 */
public class PreprocessedDocument {

    private final String originalText;
    private final String preprocessedText;
    private final String contextText;

    private HashSet<Integer> bannedTopics;
    private final ArrayList<RegionTag> regionTags;

    // region tracking
    /** Sets of done ids belonging to enclosing regions that are still open (most recent on top). */
    private Deque<HashSet<Integer>> doneIdsStack;
    /** Ids recorded for the region the cursor is currently inside. */
    private HashSet<Integer> doneIds;
    /** Index into {@link #regionTags} of the first tag that has not been passed yet. */
    private int nextTagIndex;

    /**
     * Initializes a preprocessed document. You should not use this yourself, instead let the
     * relevant documentPreprocessor create the document.
     * <p>
     * The given collections are stored as-is (they are not copied), so later changes made through
     * {@link #banTopic(int)} are visible in the set that was passed in.
     *
     * @param originalText the unmodified, original markup.
     * @param preprocessedText the modified text, stripped of all markup.
     * @param contextText any additional text that can help disambiguate terms or judge their
     *        importance (e.g. metadata)
     * @param regionTags the region tags detected in the document; {@code null} is treated as
     *        "no region tags". NOTE(review): the tracking code walks this list front to back, so
     *        it presumably expects the tags ordered by position - confirm with the preprocessors.
     * @param bannedTopics a set of ids for topics that you don't want to be detected in the
     *        document; {@code null} is treated as an empty set.
     */
    public PreprocessedDocument(String originalText, String preprocessedText, String contextText,
            ArrayList<RegionTag> regionTags, HashSet<Integer> bannedTopics) {

        this.originalText = originalText;
        this.preprocessedText = preprocessedText;
        this.contextText = contextText;

        // Null collections are replaced by empty ones so that no later call has to null-check.
        this.bannedTopics = (bannedTopics == null) ? new HashSet<Integer>() : bannedTopics;
        this.regionTags = (regionTags == null) ? new ArrayList<RegionTag>() : regionTags;

        resetRegionTracking();
    }

    /**
     * Resets the information that has been recorded about which regions have been seen (and topics
     * seen within them) so far.
     *
     * This should only be used by the document tagger.
     */
    public void resetRegionTracking() {
        doneIdsStack = new ArrayDeque<HashSet<Integer>>();
        doneIds = new HashSet<Integer>();
        nextTagIndex = 0;
    }

    /**
     * @return the original markup of the document.
     */
    public String getOriginalText() {
        return originalText;
    }

    /**
     * @return the content of the document, stripped of all markup
     */
    public String getPreprocessedText() {
        return preprocessedText;
    }

    /**
     * @return any additional text (metadata, etc) that may be helpful for disambiguating terms or
     *         judging their importance
     */
    public String getContextText() {
        return contextText;
    }

    /**
     * bans a topic so that it will not be detected in the document
     *
     * @param topicId the id of the topic to be banned.
     */
    public void banTopic(int topicId) {
        bannedTopics.add(topicId);
    }

    /**
     * @return the set of all ids that have been banned from being detected in the document. This
     *         is the live internal set, not a copy.
     */
    public HashSet<Integer> getBannedTopics() {
        return bannedTopics;
    }

    /**
     * @param topicId the id of the topic to check
     * @return true if the given topic is banned, otherwise false
     */
    public boolean isTopicBanned(int topicId) {
        return bannedTopics.contains(topicId);
    }

    /**
     * @return the index of the region we are currently looking at (should only be used by
     *         documentTagger). This is the number of region tags passed since the last reset.
     */
    public int getCurrRegionIndex() {
        return nextTagIndex;
    }

    /**
     * Advances the region cursor up to the given position and returns the id set of the region
     * that the cursor ends up in.
     * <p>
     * Every tag whose position is strictly less than {@code pos} and that has not been passed yet
     * is consumed, in list order:
     * <ul>
     * <li>an OPEN tag saves the current set and starts a fresh one;</li>
     * <li>a CLOSE tag restores the most recently saved set (or starts a fresh one when nothing
     *     was saved);</li>
     * <li>a SPLIT tag discards the current set and starts a fresh one;</li>
     * <li>any other type is skipped without touching the sets.</li>
     * </ul>
     * The cursor only moves forward; use {@link #resetRegionTracking()} to start over.
     *
     * @param pos the character position of the document where we are currently looking.
     * @return the set of ids for all topics that we have seen already in the region surrounding
     *         the given pos. This is the live internal set, so ids added to it are remembered.
     */
    public HashSet<Integer> getDoneIdsInCurrentRegion(int pos) {

        while (nextTagIndex < regionTags.size()) {
            RegionTag nextTag = regionTags.get(nextTagIndex);

            if (nextTag.getPosition() >= pos) {
                // this tag is the next we will encounter
                break;
            }

            // we have passed this tag
            nextTagIndex++;

            switch (nextTag.getType()) {
                case RegionTag.REGION_SPLIT:
                    doneIds = new HashSet<Integer>();
                    break;
                case RegionTag.REGION_CLOSE:
                    // pop previous doneIds; an unmatched close simply starts from scratch
                    doneIds = doneIdsStack.isEmpty() ? new HashSet<Integer>() : doneIdsStack.pop();
                    break;
                case RegionTag.REGION_OPEN:
                    // push current doneIds and start a new set for the nested region
                    doneIdsStack.push(doneIds);
                    doneIds = new HashSet<Integer>();
                    break;
                default:
                    // unknown tag type: nothing to track
                    break;
            }
        }

        return doneIds;
    }

    /**
     * A marker at a character position that opens, closes or splits a region of the document.
     * Tags order themselves by position only.
     */
    protected static class RegionTag implements Comparable<RegionTag> {

        /**
         * specifies an opening tag of a region
         */
        public static final int REGION_OPEN = 1;

        /**
         * specifies a closing tag of a region
         */
        public static final int REGION_CLOSE = 2;

        /**
         * specifies a tag that splits a region
         */
        public static final int REGION_SPLIT = 3;

        private final int pos;
        private final int type;

        /**
         * Initializes a region tag with the given type and location
         *
         * @param pos the character position where this tag starts
         * @param type (REGION_OPEN, REGION_CLOSE or REGION_SPLIT)
         */
        public RegionTag(int pos, int type) {
            this.pos = pos;
            this.type = type;
        }

        /**
         * @return the character position where this tag starts
         */
        public int getPosition() {
            return pos;
        }

        /**
         * @return the type of this tag (REGION_OPEN, REGION_CLOSE or REGION_SPLIT)
         */
        public int getType() {
            return type;
        }

        /**
         * Orders tags by ascending position; the tag type does not take part in the comparison.
         *
         * @param rt the tag to compare against
         * @return a negative value, zero or a positive value when this tag starts before, at or
         *         after the position of {@code rt}
         */
        @Override
        public int compareTo(RegionTag rt) {
            // Integer.compare avoids both boxing and the overflow risk of subtracting positions.
            return Integer.compare(pos, rt.getPosition());
        }

        /**
         * @return a short debug form such as {@code (12,OPEN)}
         */
        @Override
        public String toString() {
            switch (type) {
                case REGION_OPEN:
                    return "(" + pos + ",OPEN)";
                case REGION_CLOSE:
                    return "(" + pos + ",CLOSE)";
                case REGION_SPLIT:
                    return "(" + pos + ",SPLIT)";
            }

            return "(" + pos + ",UNKOWN)";
        }
    }
}