package org.juxtasoftware.service; import java.io.IOException; import java.util.ArrayList; import java.util.List; import java.util.Set; import java.util.regex.Pattern; import org.juxtasoftware.dao.JuxtaAnnotationDao; import org.juxtasoftware.diff.Token; import org.juxtasoftware.diff.TokenSource; import org.juxtasoftware.diff.impl.SimpleToken; import org.juxtasoftware.model.AnnotationConstraint; import org.juxtasoftware.model.CollatorConfig.HyphenationFilter; import org.juxtasoftware.model.JuxtaAnnotation; import org.juxtasoftware.model.QNameFilter; import eu.interedition.text.Range; import eu.interedition.text.Text; /** * Source of token annotations to be used by the diff collator */ public class RepositoryTokenSource implements TokenSource { private final TokenizerConfiguration config; private final JuxtaAnnotationDao annotationDao; private final QNameFilter tokenFilter; private final Long setId; // anything thats not alnum or hyphen is consdered punctuation here private static final Pattern PUNCTUATION = Pattern.compile("[^a-zA-Z0-9\\-]"); public RepositoryTokenSource( TokenizerConfiguration config, Long setId, JuxtaAnnotationDao annoDao, QNameFilter tokenFilter) { this.config = config; this.annotationDao = annoDao; this.tokenFilter = tokenFilter; this.setId = setId; } @Override public List<Token> tokensOf(Text text, Set<Range> ranges) throws IOException { List<Token> tokens = new ArrayList<Token>(); AnnotationConstraint constraint = new AnnotationConstraint(this.setId, text); constraint.setFilter( this.tokenFilter ); for (Range r : ranges ) { constraint.addRange(r); } constraint.setIncludeText(true); List<JuxtaAnnotation> annos = this.annotationDao.list(constraint) ; for ( JuxtaAnnotation anno : annos ) { String tokenText = anno.getContent(); if ( this.config.getHyphenationFilter().equals(HyphenationFilter.FILTER_ALL) ) { if ( tokenText.contains("-")) { String[] bits = tokenText.split("-"); if (bits.length == 2) { tokenText = bits[0].trim() + bits[1].trim(); } } } else if ( this.config.getHyphenationFilter().equals(HyphenationFilter.FILTER_LINEBREAK) ) { if ( tokenText.contains("-") && (tokenText.indexOf(10) > -1) || tokenText.indexOf(13) > -1) { String[] bits = tokenText.split("-"); if (bits.length == 2) { tokenText = bits[0].trim() + bits[1].trim(); } } } if ( this.config.isFilterCase()) { tokenText = tokenText.toLowerCase(); } if ( this.config.getHyphenationFilter().equals(HyphenationFilter.FILTER_ALL)) { tokenText = tokenText.replaceAll("-", ""); } if ( this.config.isFilterPunctuation() ) { tokenText = PUNCTUATION.matcher(tokenText).replaceAll(""); } if ( this.config.isFilterWhitespace() ) { tokenText = tokenText.trim().replaceAll("\\s+", " "); } if ( tokenText.length() > 0 ) { tokens.add(new SimpleToken(anno, tokenText)); } } return tokens; } }