RepositoryTokenSource.java example

Explorer
juxta-service-master
package org.juxtasoftware.service;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Pattern;

import org.juxtasoftware.dao.JuxtaAnnotationDao;
import org.juxtasoftware.diff.Token;
import org.juxtasoftware.diff.TokenSource;
import org.juxtasoftware.diff.impl.SimpleToken;
import org.juxtasoftware.model.AnnotationConstraint;
import org.juxtasoftware.model.CollatorConfig.HyphenationFilter;
import org.juxtasoftware.model.JuxtaAnnotation;
import org.juxtasoftware.model.QNameFilter;

import eu.interedition.text.Range;
import eu.interedition.text.Text;

/**
 * Source of token annotations to be used by the diff collator
 */
public class RepositoryTokenSource implements TokenSource {
    private final TokenizerConfiguration config;
    private final JuxtaAnnotationDao annotationDao;
    private final QNameFilter tokenFilter;
    private final Long setId;
    
    // anything thats not alnum or hyphen is consdered punctuation here
    private static final Pattern PUNCTUATION = Pattern.compile("[^a-zA-Z0-9\\-]");

    public RepositoryTokenSource( TokenizerConfiguration config, Long setId, JuxtaAnnotationDao annoDao, QNameFilter tokenFilter) {
        this.config = config;
        this.annotationDao = annoDao;
        this.tokenFilter = tokenFilter;
        this.setId = setId;
    }

    @Override
    public List<Token> tokensOf(Text text, Set<Range> ranges) throws IOException {        
        List<Token> tokens = new ArrayList<Token>();
        AnnotationConstraint constraint = new AnnotationConstraint(this.setId, text);
        constraint.setFilter( this.tokenFilter );
        for (Range r : ranges ) {
            constraint.addRange(r);
        }
        constraint.setIncludeText(true);
        List<JuxtaAnnotation> annos = this.annotationDao.list(constraint) ;
        for ( JuxtaAnnotation anno : annos ) {
            String tokenText = anno.getContent();
            
            if ( this.config.getHyphenationFilter().equals(HyphenationFilter.FILTER_ALL)  ) {
                if ( tokenText.contains("-")) {
                    String[] bits = tokenText.split("-");
                    if (bits.length == 2) {
                        tokenText = bits[0].trim() + bits[1].trim();
                    }
                }
            } else if ( this.config.getHyphenationFilter().equals(HyphenationFilter.FILTER_LINEBREAK)  ) {
                if ( tokenText.contains("-") && (tokenText.indexOf(10) > -1) || tokenText.indexOf(13) > -1) {
                    String[] bits = tokenText.split("-");
                    if (bits.length == 2) {
                        tokenText = bits[0].trim() + bits[1].trim();
                    }
                }   
            }
            
            if ( this.config.isFilterCase()) {
                tokenText = tokenText.toLowerCase();
            }
            if ( this.config.getHyphenationFilter().equals(HyphenationFilter.FILTER_ALL)) {
                tokenText = tokenText.replaceAll("-", "");
            }
            if ( this.config.isFilterPunctuation() ) {
                tokenText = PUNCTUATION.matcher(tokenText).replaceAll("");
            }
            if ( this.config.isFilterWhitespace() ) {
                tokenText = tokenText.trim().replaceAll("\\s+", " ");
            }
            
            if ( tokenText.length() > 0 ) {
                tokens.add(new SimpleToken(anno, tokenText));
            }
        }
        return tokens;
    }
}