/*
 *    MarkupStripper.java
 *    Copyright (C) 2007 David Milne, d.n.milne@gmail.com
 *
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

package org.wikipedia.miner.util;

import java.util.*;
import java.util.regex.*;

/**
 * This provides tools to strip out markup from Wikipedia articles, or anything else that has been written
 * in MediaWiki's format. It's all pretty simple, so don't expect perfect parsing. It is particularly bad at
 * dealing with templates (these are simply removed rather than resolved).
 */
public class MarkupStripper {

    private Pattern linkPattern = Pattern.compile("\\[\\[(.*?:)?(.*?)(\\|.*?)?\\]\\]");
    private Pattern isolatedBefore = Pattern.compile("(\\s*|.*\\n(\\s*))", Pattern.DOTALL);
    private Pattern isolatedAfter = Pattern.compile("(\\s*|(\\s*)\\n.*)", Pattern.DOTALL);

    private EmphasisResolver emphasisResolver = new EmphasisResolver();
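    /*
     * Illustrative usage sketch, not part of the original API: shows how the
     * stripping methods below are typically chained together. The sample markup
     * string is invented for demonstration purposes only.
     */
    public static void main(String[] args) {
        MarkupStripper stripper = new MarkupStripper();
        String markup = "'''Kiwi''' are [[flightless bird]]s endemic to [[New Zealand]].{{Citation needed}}";

        //strip everything down to plain text, discarding unwanted markup entirely
        String plainText = stripper.stripToPlainText(markup, null);
        plainText = stripper.stripEmphasis(plainText, null);

        System.out.println(stripper.stripExcessNewlines(plainText));
    }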
    /**
     * Returns a copy of the given markup, where all markup has been removed except for
     * internal links to other Wikipedia pages (e.g. to articles or categories), section
     * headers, list markers, and bold/italic markers.
     *
     * By default, unwanted markup is completely discarded. You can optionally specify
     * a character to replace the regions that are discarded, so that the length of the
     * string and the locations of unstripped characters are not modified.
     */
    public String stripAllButInternalLinksAndEmphasis(String markup, Character replacement) {

        //deal with comments and math regions entirely separately.
        //Comments often contain poorly nested items that the remaining patterns will complain about.
        //Math regions contain items that look confusingly like templates.
        Vector<int[]> regions = gatherSimpleRegions(markup, "\\<\\!--(.*?)--\\>");
        regions = mergeRegionLists(regions, gatherComplexRegions(markup, "\\<math(\\s*?)([^>\\/]*?)\\>", "\\<\\/math(\\s*?)\\>"));
        String clearedMarkup = stripRegions(markup, regions, replacement);

        //deal with templates entirely separately. They often end in |}} which confuses the gathering of tables.
        regions = gatherTemplates(clearedMarkup);
        clearedMarkup = stripRegions(clearedMarkup, regions, replacement);

        //now gather all of the other regions we want to ignore
        regions = gatherTables(clearedMarkup);
        regions = mergeRegionLists(regions, gatherHTML(clearedMarkup));
        regions = mergeRegionLists(regions, gatherExternalLinks(clearedMarkup));
        regions = mergeRegionLists(regions, gatherMagicWords(clearedMarkup));

        //blank these regions now (they need to be blanked before we can correctly identify the remaining regions)
        clearedMarkup = stripRegions(clearedMarkup, regions, replacement);

        regions = gatherMisformattedStarts(clearedMarkup);
        clearedMarkup = stripRegions(clearedMarkup, regions, replacement);

        return clearedMarkup;
    }
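    /*
     * Hedged illustration, not part of the original API, of the replacement-character
     * contract described above: when a replacement character is supplied, stripped
     * regions are overwritten character-for-character, so the length of the string
     * (and offsets into it) stays the same. The sample markup is invented.
     */
    private void demoReplacementContract() {
        String markup = "Some text <!-- a comment --> more text";
        String stripped = stripAllButInternalLinksAndEmphasis(markup, ' ');

        //the comment is blanked out with spaces rather than removed
        assert stripped.length() == markup.length();
        System.out.println(stripped);
    }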
    /**
     * Returns a copy of the given markup, where all links to Wikipedia pages
     * (categories, articles, etc.) have been removed. Links to articles are
     * replaced with their anchor text. All other links are removed completely.
     *
     * By default, unwanted markup is completely discarded. You can optionally specify
     * a character to replace the regions that are discarded, so that the length of the
     * string and the locations of unstripped characters are not modified.
     */
    public String stripInternalLinks(String markup, Character replacement) {

        Vector<int[]> regions = gatherComplexRegions(markup, "\\[\\[", "\\]\\]");

        StringBuffer strippedMarkup = new StringBuffer();
        int lastPos = markup.length();

        //because regions are sorted by end position, we work backwards through them
        int i = regions.size();
        while (i > 0) {
            i--;

            int[] region = regions.elementAt(i);

            //only deal with this region if it is not within a region we have already dealt with
            if (region[0] < lastPos) {

                //copy everything between this region and the start of the last one we dealt with
                strippedMarkup.insert(0, markup.substring(region[1], lastPos));

                String linkMarkup = markup.substring(region[0], region[1]);

                //by default (if anything goes wrong) we will keep the link as it is
                String strippedLinkMarkup = linkMarkup;

                Matcher m = linkPattern.matcher(linkMarkup);
                if (m.matches()) {

                    String prefix = m.group(1);
                    String dest = m.group(2);
                    String anchor = m.group(3);

                    if (prefix != null) {
                        //this is not a link to another article, so get rid of it entirely
                        if (replacement != null)
                            strippedLinkMarkup = linkMarkup.replaceAll(".", replacement.toString());
                        else
                            strippedLinkMarkup = "";

                    } else {
                        if (anchor != null) {
                            //this has an anchor defined, so use that and blank out everything else
                            if (replacement != null)
                                strippedLinkMarkup = replacement + replacement + dest.replaceAll(".", replacement.toString()) + replacement + anchor.substring(1) + replacement + replacement;
                            else
                                strippedLinkMarkup = anchor.substring(1);

                        } else {
                            //this has no anchor defined, so treat dest as the anchor and blank out everything else
                            if (replacement != null)
                                strippedLinkMarkup = replacement + replacement + dest + replacement + replacement;
                            else
                                strippedLinkMarkup = dest;
                        }
                    }
                } else {
                    //logProblem("our pattern for delimiting links has a problem");
                }

                strippedMarkup.insert(0, strippedLinkMarkup);
                lastPos = region[0];
            }
        }

        if (lastPos > 0)
            strippedMarkup.insert(0, markup.substring(0, lastPos));

        return strippedMarkup.toString();
    }

    /**
     * Returns a copy of the given markup, where bold and italic markers have been removed.
     *
     * By default, the markers are completely discarded. You can optionally specify
     * a character to replace them with.
     */
    public String stripEmphasis(String markup, Character replacement) {

        String resolvedMarkup = emphasisResolver.resolveEmphasis(markup);
        Vector<int[]> regions = gatherSimpleRegions(resolvedMarkup, "\\<\\/?[bi]\\>");

        StringBuffer clearedMarkup = new StringBuffer();
        int lastPos = resolvedMarkup.length();

        //because regions are sorted by end position, we work backwards through them
        int i = regions.size();
        while (i > 0) {
            i--;

            int[] region = regions.elementAt(i);

            //only deal with this region if it is not within a region we have already dealt with
            if (region[0] < lastPos) {

                //copy markup after this region and before the beginning of the last region we dealt with
                if (region[1] < lastPos)
                    clearedMarkup.insert(0, resolvedMarkup.substring(region[1], lastPos));

                if (replacement != null) {
                    String tag = resolvedMarkup.substring(region[0], region[1]);

                    String fill;
                    if (tag.matches("\\<\\/?b\\>"))
                        fill = "'''";
                    else
                        fill = "''";

                    fill = fill.replaceAll(".", replacement.toString());
                    clearedMarkup.insert(0, fill);
                }

                lastPos = region[0];
            }
        }

        clearedMarkup.insert(0, resolvedMarkup.substring(0, lastPos));
        return clearedMarkup.toString();
    }
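    /*
     * Illustrative sketch, not part of the original API, of stripInternalLinks above:
     * [[dest|anchor]] links are reduced to their anchor text, [[dest]] links to their
     * destination, and prefixed links (categories, interwiki, etc.) are removed
     * entirely. The sample markup is invented.
     */
    private void demoStripInternalLinks() {
        String markup = "[[New Zealand|NZ]] exports [[kiwifruit]]. [[Category:Fruit]]";
        String stripped = stripInternalLinks(markup, null);

        //expected (roughly): "NZ exports kiwifruit. "
        System.out.println(stripped);
    }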
    /**
     * Returns a copy of the given markup, where all links to Wikipedia pages
     * that are not articles (categories, language links, etc.) have been removed.
     *
     * By default, unwanted markup is completely discarded. You can optionally specify
     * a character to replace the regions that are discarded, so that the length of the
     * string and the locations of unstripped characters are not modified.
     */
    public String stripNonArticleInternalLinks(String markup, Character replacement) {

        Vector<int[]> regions = gatherComplexRegions(markup, "\\[\\[", "\\]\\]");

        StringBuffer strippedMarkup = new StringBuffer();
        int lastPos = markup.length();

        //because regions are sorted by end position, we work backwards through them
        int i = regions.size();
        while (i > 0) {
            i--;

            int[] region = regions.elementAt(i);

            //only deal with this region if it is not within a region we have already dealt with
            if (region[0] < lastPos) {

                //copy everything between this region and the start of the last one we dealt with
                strippedMarkup.insert(0, markup.substring(region[1], lastPos));

                String linkMarkup = markup.substring(region[0], region[1]);

                //by default (if anything goes wrong) we will keep the link as it is
                String strippedLinkMarkup = linkMarkup;

                Matcher m = linkPattern.matcher(linkMarkup);
                if (m.matches()) {

                    String prefix = m.group(1);

                    if (prefix != null) {
                        //this is not a link to another article, so get rid of it entirely
                        if (replacement != null)
                            strippedLinkMarkup = linkMarkup.replaceAll(".", replacement.toString());
                        else
                            strippedLinkMarkup = "";
                    }
                } else {
                    //logProblem("our pattern for delimiting links has a problem");
                }

                strippedMarkup.insert(0, strippedLinkMarkup);
                lastPos = region[0];
            }
        }

        if (lastPos > 0)
            strippedMarkup.insert(0, markup.substring(0, lastPos));

        return strippedMarkup.toString();
    }

    /**
     * Removes the given sections (both header and content, including nested subsections).
     *
     * @param markup the markup to be stripped
     * @param sectionNames the names of the sections (case insensitive) to remove
     * @param replacement an optional character with which to replace the stripped sections, so that the length of the string is not modified
     * @return the stripped markup
     */
    public String stripSections(String markup, String[] sectionNames, Character replacement) {

        Vector<int[]> regions = new Vector<int[]>();

        for (String sectionName : sectionNames)
            regions = mergeRegionLists(regions, gatherSection(markup, sectionName));

        return stripRegions(markup, regions, replacement);
    }

    /**
     * Returns a copy of the given markup, where all section headers have been removed.
     *
     * By default, the headers are completely discarded. You can optionally specify
     * a character to replace them with, so that the length of the string is not modified.
     */
    public String stripSectionHeaders(String markup, Character replacement) {

        Vector<int[]> regions = gatherSectionHeaders(markup);
        return stripRegions(markup, regions, replacement);
    }

    /**
     * Convenience method that returns a copy of the given markup stripped down to plain text:
     * all markup is removed except for section headers, list markers, and bold/italic markers,
     * and internal links are reduced to their anchor text.
     *
     * By default, unwanted markup is completely discarded. You can optionally specify
     * a character to replace the regions that are discarded, so that the length of the
     * string and the locations of unstripped characters are not modified.
     */
    public String stripToPlainText(String markup, Character replacement) {

        String clearedMarkup = stripAllButInternalLinksAndEmphasis(markup, replacement);
        clearedMarkup = stripInternalLinks(clearedMarkup, replacement);

        return clearedMarkup;
    }
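    /*
     * Illustrative sketch, not part of the original API: stripping the boilerplate
     * sections that are rarely wanted when extracting article text. The sample
     * markup and the list of section names are invented.
     */
    private void demoStripSections() {
        String markup = "The kiwi is a flightless bird.\n\n==Taxonomy==\nFive species are recognised.\n\n==References==\n<references/>";
        String[] unwanted = {"references", "external links", "see also", "further reading"};

        //removes the named sections (headers and content); pass a character instead of null to preserve string length
        System.out.println(stripSections(markup, unwanted, null));
    }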
    /**
     * Returns a copy of the given markup, where the given regions have been removed.
     * Regions are identified using one of the gather methods.
     *
     * By default, unwanted markup is completely discarded. You can optionally specify
     * a character to replace the regions that are discarded, so that the length of the
     * string and the locations of unstripped characters are not modified.
     */
    public String stripRegions(String markup, Vector<int[]> regions, Character replacement) {

        StringBuffer clearedMarkup = new StringBuffer();

        int lastPos = markup.length();

        //because regions are sorted by end position, we work backwards through them
        int i = regions.size();
        while (i > 0) {
            i--;

            int[] region = regions.elementAt(i);

            //only deal with this region if it is not within a region we have already dealt with
            if (region[0] < lastPos) {

                //copy markup after this region and before the beginning of the last region we dealt with
                if (region[1] < lastPos)
                    clearedMarkup.insert(0, markup.substring(region[1], lastPos));

                if (replacement != null) {
                    String fill = markup.substring(region[0], region[1]).replaceAll(".", replacement.toString());
                    clearedMarkup.insert(0, fill);
                }

                lastPos = region[0];
            }
        }

        clearedMarkup.insert(0, markup.substring(0, lastPos));
        return clearedMarkup.toString();
    }

    /**
     * Returns a copy of the given markup where runs of three or more newlines have been
     * collapsed into two, and leading and trailing whitespace has been trimmed.
     */
    public String stripExcessNewlines(String markup) {

        String strippedMarkup = markup.replaceAll("\n{3,}", "\n\n");
        return strippedMarkup.trim();
    }


    // ======================================================================================================

    /**
     * Gathers areas within the markup which correspond to links to other Wikipedia pages
     * (as identified by [[ and ]] pairs). Note: these can be nested (e.g. for images).
     */
    public Vector<int[]> gatherInternalLinks(String markup) {
        return gatherComplexRegions(markup, "\\[\\[", "\\]\\]");
    }

    /**
     * Gathers areas within the markup which correspond to templates (as identified by {{ and }} pairs).
     */
    public Vector<int[]> gatherTemplates(String markup) {
        return gatherComplexRegions(markup, "\\{\\{", "\\}\\}");
    }

    /**
     * Returns the subset of the given regions that sit on their own line(s) of the markup,
     * with nothing but whitespace before and after them.
     */
    public Vector<int[]> getIsolatedRegions(Vector<int[]> regions, String markup) {

        Vector<int[]> isolatedRegions = new Vector<int[]>();

        for (int[] region : regions) {
            if (isIsolated(region, markup))
                isolatedRegions.add(region);
        }

        return isolatedRegions;
    }

    /**
     * Returns the subset of the given regions that are not isolated on their own line(s) of the markup.
     */
    public Vector<int[]> excludeIsolatedRegions(Vector<int[]> regions, String markup) {

        Vector<int[]> unisolatedRegions = new Vector<int[]>();

        for (int[] region : regions) {
            if (!isIsolated(region, markup))
                unisolatedRegions.add(region);
        }

        return unisolatedRegions;
    }

    private boolean isIsolated(int[] region, String markup) {

        String before = markup.substring(0, region[0]);
        String after = markup.substring(region[1]);

        Matcher m = isolatedBefore.matcher(before);
        if (!m.matches())
            return false;

        m = isolatedAfter.matcher(after);
        if (!m.matches())
            return false;

        return true;
    }

    /**
     * Gathers areas within the markup which correspond to tables (as identified by {| and |} pairs).
     */
    public Vector<int[]> gatherTables(String markup) {
        return gatherComplexRegions(markup, "\\{\\|", "\\|\\}");
    }
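    /*
     * Illustrative sketch, not part of the original API: gathering template regions
     * with gatherTemplates above and blanking them out with stripRegions while
     * preserving string length. The sample markup is invented.
     */
    private void demoGatherAndStrip() {
        String markup = "{{Infobox|name=Kiwi}}\nThe kiwi is a bird.";
        Vector<int[]> regions = gatherTemplates(markup);
        String blanked = stripRegions(markup, regions, ' ');

        //blanked has the same length as markup, with the template overwritten by spaces
        System.out.println(blanked);
    }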
    /**
     * Gathers areas within the markup which correspond to HTML tags.
     *
     * div and ref regions will enclose the beginning and ending tags, and everything in between,
     * since we assume this content is supposed to be discarded. All other regions will only include the
     * individual tag, since we assume the content between such pairs is supposed to be retained.
     */
    public Vector<int[]> gatherHTML(String markup) {

        //gather and merge references
        Vector<int[]> regions = gatherReferences(markup);

        //gather <div> </div> pairs
        regions = mergeRegionLists(regions, gatherComplexRegions(markup, "\\<div(\\s*?)([^>\\/]*?)\\>", "\\<\\/div(\\s*?)\\>"));

        //gather remaining tags
        regions = mergeRegionLists(regions, gatherSimpleRegions(markup, "\\<(.*?)\\>"));

        return regions;
    }

    /**
     * Gathers areas within the markup which correspond to references (markup to support claims or facts).
     * The regions will enclose the beginning and ending tags, and everything in between,
     * since we assume this content is supposed to be discarded.
     */
    public Vector<int[]> gatherReferences(String markup) {

        //gather self-closing <ref/> tags
        Vector<int[]> regions = gatherSimpleRegions(markup, "\\<ref(\\s*?)([^>]*?)\\/\\>");

        //gather <ref> </ref> pairs (these shouldn't be nested, but just in case...)
        regions = mergeRegionLists(regions, gatherComplexRegions(markup, "\\<ref(\\s*?)([^>\\/]*?)\\>", "\\<\\/ref(\\s*?)\\>"));

        return regions;
    }

    /**
     * Gathers items which the MediaWiki documentation refers to as magic words, e.g. __NOTOC__
     */
    public Vector<int[]> gatherMagicWords(String markup) {
        return gatherSimpleRegions(markup, "\\_\\_([A-Z]+)\\_\\_");
    }

    /**
     * Gathers all links to external web pages.
     */
    public Vector<int[]> gatherExternalLinks(String markup) {
        return gatherSimpleRegions(markup, "\\[(http|www|ftp).*?\\]");
    }

    /**
     * Gathers bold and italic markup.
     */
    public Vector<int[]> gatherEmphasis(String markup) {
        return gatherSimpleRegions(markup, "'{2,}");
    }

    /**
     * Gathers section headers.
     */
    public Vector<int[]> gatherSectionHeaders(String markup) {

        Vector<int[]> regions = new Vector<int[]>();

        Pattern p = Pattern.compile("\\n\\s*((={2,})[^=].*?\\2)[^=]");
        Matcher m = p.matcher(markup);

        while (m.find()) {
            int[] region = {m.start(1), m.end(1)};
            regions.add(region);
        }

        return regions;
    }

    /**
     * Gathers the section with the given name (case insensitive), including its header,
     * content, and any nested subsections. Returns an empty list if no such section is found.
     */
    public Vector<int[]> gatherSection(String markup, String sectionName) {

        Vector<int[]> regions = new Vector<int[]>();

        //find the start of the section
        Pattern startP = Pattern.compile("\\n\\s*(={2,})\\s*" + sectionName + "\\s*\\1", Pattern.CASE_INSENSITIVE);
        Matcher startM = startP.matcher(markup);

        if (startM.find()) {

            int start = startM.start(1);
            int level = startM.group(1).length();
            int end;

            //look for the start of the next section that is at the same level or higher
            Pattern endP = Pattern.compile("\\n\\s*(={2," + level + "})[^=].*\\1");
            Matcher endM = endP.matcher(markup);

            if (endM.find(startM.end()))
                end = endM.start();
            else
                end = markup.length();

            int[] region = {start, end};
            regions.add(region);
        }

        return regions;
    }

    /**
     * Gathers markup which indicates indented items, or numbered and unnumbered list items.
     */
    public Vector<int[]> gatherListAndIndentMarkers(String markup) {

        Vector<int[]> regions = gatherSimpleRegions(markup, "\n( *)([#*:]+)");

        //increment the start positions of all regions by one, so they don't include the newline character
        for (int[] region : regions)
            region[0]++;

        //add an occurrence of a list item on the first line (if there is one)
        regions = mergeRegionLists(regions, gatherSimpleRegions(markup, "^( *)([#*:]+)"));

        return regions;
    }
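    /*
     * Illustrative sketch, not part of the original API, of gatherReferences above:
     * reference tags and their content are gathered as whole regions, so stripping
     * them discards the citation text. The sample markup is invented.
     */
    private void demoGatherReferences() {
        String markup = "Kiwi are nocturnal.<ref>Smith (2005)</ref> They are flightless.";
        Vector<int[]> regions = gatherReferences(markup);

        //prints (roughly): "Kiwi are nocturnal. They are flightless."
        System.out.println(stripRegions(markup, regions, null));
    }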
    private boolean isEntirelyItalicised(String line) {

        String resolvedLine = emphasisResolver.resolveEmphasis(line);

        Pattern p = Pattern.compile("(\\s*)\\<i\\>(.*?)\\<\\/i\\>\\.?(\\s*)");
        Matcher m = p.matcher(resolvedLine);

        if (m.matches()) {
            //if the italics are broken by an unitalicised middle, then this line is not entirely italicised
            return !m.group(2).contains("</i>");
        } else {
            return false;
        }
    }

    /**
     * Gathers the region at the start of the given markup made up of lines that are blank,
     * indented, entirely italicised, or lone images. These correspond to quotes, disambiguation
     * and navigation notes that the author should have used templates to identify, but didn't.
     * This will only work after templates have been cleaned out, and before list markers have been.
     */
    public Vector<int[]> gatherMisformattedStarts(String markup) {

        String[] lines = markup.split("\n");

        int ignoreUntil = 0;

        for (String line : lines) {

            boolean isWhitespace = line.matches("^(\\s*)$");
            boolean isIndented = line.matches("^(\\s*):.*");
            boolean isItalicised = isEntirelyItalicised(line);
            boolean isImage = line.matches("^(\\s*)\\[\\[Image\\:(.*?)\\]\\](\\s*)");

            if (isWhitespace || isIndented || isItalicised || isImage) {
                //ignore this line
                ignoreUntil = ignoreUntil + line.length() + 1;
            } else {
                break;
            }
        }

        int[] region = {0, ignoreUntil};

        Vector<int[]> regions = new Vector<int[]>();
        regions.add(region);

        return regions;
    }

    /**
     * Gathers simple regions: ones which cannot be nested within each other.
     *
     * The returned regions (arrays of start and end character positions) will be sorted
     * by end position (and also by start position, since they cannot overlap).
     */
    public Vector<int[]> gatherSimpleRegions(String markup, String regex) {

        //a list of the regions we have identified
        //each region is given as an array containing the start and end character indexes of the region
        Vector<int[]> regions = new Vector<int[]>();

        Pattern p = Pattern.compile(regex, Pattern.DOTALL);
        Matcher m = p.matcher(markup);

        while (m.find()) {
            int[] region = {m.start(), m.end()};
            regions.add(region);
        }

        return regions;
    }
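    /*
     * Illustrative sketch, not part of the original API, of gatherComplexRegions below:
     * unlike simple regions, complex regions may nest, and each nested pair is returned
     * as its own region. The sample markup is invented.
     */
    private void demoComplexRegions() {
        String markup = "{{outer {{inner}} still outer}}";
        Vector<int[]> regions = gatherComplexRegions(markup, "\\{\\{", "\\}\\}");

        //two regions are returned: the inner template first, then the outer one that encloses it
        for (int[] region : regions)
            System.out.println(markup.substring(region[0], region[1]));
    }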
    /**
     * Gathers complex regions: ones which can potentially be nested within each other.
     *
     * The returned regions (arrays of start and end character positions) will be either
     * non-overlapping or cleanly nested, and sorted by end position.
     */
    public Vector<int[]> gatherComplexRegions(String markup, String startRegex, String endRegex) {

        //a list of the regions we have identified
        //each region is given as an array containing the start and end character indexes of the region
        Vector<int[]> regions = new Vector<int[]>();

        //a stack of region starting positions
        Vector<Integer> startStack = new Vector<Integer>();

        Pattern p = Pattern.compile("((" + startRegex + ")|(" + endRegex + "))", Pattern.DOTALL);
        Matcher m = p.matcher(markup);

        while (m.find()) {

            Integer p1 = m.start();
            Integer p2 = m.end();

            if (m.group(2) != null) {
                //this is the start of an item
                startStack.add(p1);
            } else {
                //this is the end of an item
                if (!startStack.isEmpty()) {
                    int start = startStack.elementAt(startStack.size() - 1);
                    startStack.removeElementAt(startStack.size() - 1);

                    int[] region = {start, p2};
                    regions.add(region);
                } else {
                    //logProblem("oops, we found the end of an item, but have no idea where it started");
                }
            }
        }

        if (!startStack.isEmpty()) {
            //logProblem("oops, we got to the end of the markup and still have items that have been started but not finished");
        }

        return regions;
    }

    /**
     * Collapses a region list, by discarding any regions which are contained within
     * other regions.
     *
     * The resulting region list will be non-overlapping and sorted by end positions.
     */
    /*
    private Vector<int[]> collapseRegionList(Vector<int[]> regions) {

        Vector<int[]> newRegions = new Vector<int[]>();

        int index = regions.size() - 1;
        int lastPos = -1;

        while (index >= 0) {

            int[] region = regions.elementAt(index);

            if (lastPos < 0 || region[1] <= lastPos) {
                newRegions.add(0, region);
                lastPos = region[0];
            }

            index--;
        }

        return newRegions;
    }
    */
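    /*
     * Illustrative sketch, not part of the original API, of the merge behaviour
     * documented below: regions contained within other regions are discarded, and
     * the result is sorted by end position. The sample regions are invented.
     */
    private void demoMergeRegionLists() {
        Vector<int[]> a = new Vector<int[]>();
        a.add(new int[]{0, 10});

        Vector<int[]> b = new Vector<int[]>();
        b.add(new int[]{2, 5});    //contained within {0,10}, so it will be discarded
        b.add(new int[]{12, 20});

        //prints (roughly): 0..10 then 12..20
        for (int[] region : mergeRegionLists(a, b))
            System.out.println(region[0] + ".." + region[1]);
    }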
    /**
     * Merges two lists of regions into one sorted list. Regions that are contained
     * within other regions are discarded.
     *
     * The resulting region list will be non-overlapping and sorted by end positions.
     */
    private Vector<int[]> mergeRegionLists(Vector<int[]> regionsA, Vector<int[]> regionsB) {

        int indexA = regionsA.size() - 1;
        int indexB = regionsB.size() - 1;

        Vector<int[]> newRegions = new Vector<int[]>();

        int lastPos = -1;

        while (indexA >= 0 && indexB >= 0) {

            int[] regionA = regionsA.elementAt(indexA);
            int[] regionB = regionsB.elementAt(indexB);

            if (lastPos >= 0 && regionA[0] >= lastPos && regionB[0] >= lastPos) {
                //both of these are inside regions that we have already dealt with, so discard them
                indexA--;
                indexB--;

            } else {

                if (regionB[1] > regionA[1]) {
                    //let's see if we need to copy B across
                    if ((regionB[0] >= regionA[0] && regionB[1] <= regionA[1]) || (lastPos >= 0 && regionB[0] >= lastPos)) {
                        //either A or the last region we dealt with completely contains B, so we just discard B
                    } else {
                        //deal with B now
                        int[] newRegion = {regionB[0], min(regionB[1], lastPos)};
                        newRegions.add(0, newRegion);
                        lastPos = regionB[0];
                    }

                    indexB--;

                } else {
                    //let's see if we need to copy A across
                    if ((regionA[0] >= regionB[0] && regionA[1] <= regionB[1]) || (lastPos >= 0 && regionA[0] >= lastPos)) {
                        //either B or the last region we dealt with completely contains A, so we just discard A
                    } else {
                        //deal with A now
                        int[] newRegion = {regionA[0], min(regionA[1], lastPos)};
                        newRegions.add(0, newRegion);
                        lastPos = regionA[0];
                    }

                    indexA--;
                }
            }
        }

        //deal with any remaining A regions
        while (indexA >= 0) {

            int[] regionA = regionsA.elementAt(indexA);

            if (lastPos >= 0 && regionA[0] > lastPos) {
                //this is already covered, so ignore it
            } else {
                int[] newRegion = {regionA[0], min(regionA[1], lastPos)};
                newRegions.add(0, newRegion);
                lastPos = regionA[0];
            }

            indexA--;
        }

        //deal with any remaining B regions
        while (indexB >= 0) {

            int[] regionB = regionsB.elementAt(indexB);

            if (lastPos >= 0 && regionB[0] > lastPos) {
                //this is already covered, so ignore it
            } else {
                int[] newRegion = {regionB[0], min(regionB[1], lastPos)};
                newRegions.add(0, newRegion);
                lastPos = regionB[0];
            }

            indexB--;
        }

        return newRegions;
    }

    /**
     * Returns the smaller of the two values, ignoring either one if it is negative
     * (a negative lastPos means no region has been dealt with yet).
     */
    private int min(int a, int b) {

        if (a >= 0 && b >= 0) {
            return Math.min(a, b);
        } else {
            if (a >= 0)
                return a;
            else
                return b;
        }
    }
}