/**
*
*/
package uk.bl.wa.nlp.analysers;
/*
* #%L
* warc-indexer
* %%
* Copyright (C) 2013 - 2014 The UK Web Archive
* %%
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as
* published by the Free Software Foundation, either version 2 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program. If not, see
* <http://www.gnu.org/licenses/gpl-2.0.html>.
* #L%
*/
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.apache.tika.metadata.Metadata;
import com.typesafe.config.Config;
import uk.bl.wa.analyser.text.AbstractTextAnalyser;
import uk.bl.wa.nlp.parsers.StanfordAnnotatorParser;
import uk.bl.wa.solr.SolrFields;
import uk.bl.wa.solr.SolrRecord;
/**
 * Text analyser that passes the leading part of a document's text (at most
 * {@link #MAX_CHARS_TO_ANALYSE} chars) through a
 * {@link StanfordAnnotatorParser} and stores the reported average sentiment
 * on the Solr record.
 *
 * @author anj
 *
 */
public class StanfordAnalyser extends AbstractTextAnalyser {

    /** Parser instance reused for every call to {@link #analyse(String, SolrRecord)}. */
    StanfordAnnotatorParser parser = new StanfordAnnotatorParser();

    /** Maximum number of leading chars of the text that are handed to the parser. */
    public static final int MAX_CHARS_TO_ANALYSE = 10000;

    /**
     * Creates the analyser.
     *
     * @param conf configuration; not read by this constructor
     */
    public StanfordAnalyser(Config conf) {
    }

    /**
     * Analyses the start of the given text and adds the average sentiment to
     * the Solr record under {@link SolrFields#SENTIMENT}.
     * <p>
     * The named entities and the sentiment distribution reported by the parser
     * are only printed to standard output; they are not added to the record.
     *
     * @param text the text to analyse; if {@code null} nothing is done
     * @param solr the record that receives the sentiment field
     *
     * @see uk.bl.wa.analyser.text.AbstractTextAnalyser#analyse(java.lang.String, uk.bl.wa.solr.SolrRecord)
     */
    @Override
    public void analyse(String text, SolrRecord solr) {
        // Nothing to analyse; avoids a NullPointerException on text.length().
        if (text == null) {
            return;
        }

        // Only the first MAX_CHARS_TO_ANALYSE chars are analysed.
        int sentilen = Math.min(MAX_CHARS_TO_ANALYSE, text.length());
        // Do not cut a surrogate pair in half: if the cut would fall between a
        // high and a low surrogate, drop the dangling high surrogate as well.
        if (sentilen > 0 && sentilen < text.length()
                && Character.isHighSurrogate(text.charAt(sentilen - 1))
                && Character.isLowSurrogate(text.charAt(sentilen))) {
            sentilen--;
        }
        String sentitext = text.substring(0, sentilen);

        Metadata metadata = new Metadata();
        parser.parse(sentitext, metadata);

        // NOTE(review): the values below are only written to stdout. They look
        // like debugging output and should probably go through the project's
        // logging framework and/or be indexed - confirm the intent.
        Set<String> persons = new HashSet<String>(Arrays.asList(metadata.getValues(StanfordAnnotatorParser.NER_PERSONS)));
        System.out.println("PERSONS: "+persons);
        Set<String> orgs = new HashSet<String>(Arrays.asList(metadata.getValues(StanfordAnnotatorParser.NER_ORGANISATIONS)));
        System.out.println("ORGANIZATIONS: "+orgs);
        Set<String> locs = new HashSet<String>(Arrays.asList(metadata.getValues(StanfordAnnotatorParser.NER_LOCATIONS)));
        System.out.println("LOCATIONS: "+locs);
        Set<String> dates = new HashSet<String>(Arrays.asList(metadata.getValues(StanfordAnnotatorParser.NER_DATES)));
        System.out.println("DATES: "+dates);
        Set<String> misc = new HashSet<String>(Arrays.asList(metadata.getValues(StanfordAnnotatorParser.NER_MISC)));
        System.out.println("MISC: "+misc);

        /* And sentiments */
        String sentiment = metadata.get(StanfordAnnotatorParser.AVG_SENTIMENT);
        System.out.println("Sentiment: "+sentiment);
        // Metadata.get yields null when the parser set no value; do not index that.
        if (sentiment != null) {
            solr.addField( SolrFields.SENTIMENT, sentiment );
        }
        List<String> sentiments = Arrays.asList(metadata.getValues(StanfordAnnotatorParser.SENTIMENT_DIST));
        System.out.println("Sentiments: "+sentiments);
    }
}