package de.isabeldrostfromm.sof.termvector;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;

import org.apache.mahout.math.map.OpenObjectDoubleHashMap;
import org.elasticsearch.common.collect.Sets;

import com.google.common.base.Preconditions;
import com.google.gson.internal.StringMap;

import de.isabeldrostfromm.sof.Example;
import de.isabeldrostfromm.sof.ProviderIterator;

/**
 * Walks over the hits of a parsed search response and converts each hit into
 * an {@link Example}.
 *
 * <p>For every hit the document id is read from {@code _id}; the term vectors
 * and the stored source of that document are then fetched through the
 * supplied {@link RESTProvider} and combined into a {@code ParsedDocument},
 * which is vectorised.
 *
 * <p>NOTE(review): the key names ({@code hits}, {@code _id}, {@code _source},
 * {@code term_vectors}) look like an Elasticsearch response parsed by Gson;
 * confirm against {@code RESTProvider}.
 */
public class RESTProviderIterator extends ProviderIterator {

    /** The individual hits of the search response; never {@code null}. */
    @SuppressWarnings("rawtypes")
    private final List<StringMap> hits;

    /** Index of the hit returned by the most recent {@link #parse()} call; -1 before the first call. */
    private int cursor = -1;

    /** Used to look up term vectors and metadata for each hit. */
    private final RESTProvider connection;

    private final Vectoriser v = new Vectoriser();

    /**
     * Creates an iterator over the hits contained in {@code result}.
     *
     * @param result     parsed search response; an empty map yields an
     *                   iterator without any hits, otherwise it must contain
     *                   a {@code "hits"} map that itself contains a
     *                   {@code "hits"} list
     * @param connection provider used to fetch term vectors and metadata
     * @throws NullPointerException if {@code result} is {@code null} or lacks
     *                              the {@code hits.hits} structure
     */
    @SuppressWarnings({ "rawtypes", "unchecked" })
    public RESTProviderIterator(Map result, RESTProvider connection) {
        Preconditions.checkNotNull(result);
        if (result.isEmpty()) {
            this.hits = new ArrayList<StringMap>();
        } else {
            // Fail with the offending response in the message instead of
            // printing it to stdout and then hitting an anonymous NPE.
            StringMap outerHits = (StringMap) Preconditions.checkNotNull(
                    result.get("hits"),
                    "Search response contains no \"hits\" element: %s", result);
            this.hits = (List<StringMap>) Preconditions.checkNotNull(
                    outerHits.get("hits"),
                    "Search response contains no \"hits.hits\" list: %s", result);
        }
        this.connection = connection;
    }

    /**
     * Converts the next hit into an {@link Example}.
     *
     * @return the example built from the next hit, or {@code null} once all
     *         hits have been consumed
     */
    @SuppressWarnings({ "rawtypes", "unchecked" })
    @Override
    protected Example parse() {
        if (cursor + 1 >= hits.size()) {
            return null;
        }
        cursor++;
        StringMap hit = hits.get(cursor);
        String id = (String) hit.get("_id");

        Map doc = connection.getTermVectors(id);
        StringMap<StringMap> termVectors = (StringMap) doc.get("term_vectors");
        OpenObjectDoubleHashMap<String> titleParse = termFrequencies(termVectors, "title");
        OpenObjectDoubleHashMap<String> bodyParse = termFrequencies(termVectors, "body");

        Map docMeta = connection.getMetaData(id);
        StringMap<String> source = (StringMap) docMeta.get("_source");
        String state = source.get("open_status");
        String reputation = source.get("reputation_at_post_creation");

        // NOTE(review): a tag key that is absent from the source puts null
        // into this set (unchanged from the previous behaviour) - confirm
        // that the vectoriser copes with a null tag.
        Set<String> tags = Sets.newHashSet(
                source.get("tag_1"),
                source.get("tag_2"),
                source.get("tag_3"),
                source.get("tag_4"),
                source.get("tag_5"));

        ParsedDocument parsedDoc = ParsedDocument.of(
                bodyParse, state, titleParse, Double.parseDouble(reputation), tags);
        return Example.of(v.vectorise(parsedDoc), state);
    }

    /**
     * Collects the {@code term_freq} of every term listed for {@code field}.
     *
     * @param termVectors the {@code term_vectors} element of a term vector response
     * @param field       name of the field whose terms are read
     * @return term to frequency mapping; empty when the field or its
     *         {@code "terms"} element is missing
     */
    @SuppressWarnings({ "rawtypes", "unchecked" })
    private static OpenObjectDoubleHashMap<String> termFrequencies(
            StringMap<StringMap> termVectors, String field) {
        OpenObjectDoubleHashMap<String> frequencies = new OpenObjectDoubleHashMap<String>();

        // Presumably a field without any indexed terms (e.g. an empty title)
        // is left out of the response - TODO confirm for the server version
        // in use. Treat that as "no terms" rather than failing.
        StringMap<StringMap> fieldVector = (StringMap) termVectors.get(field);
        if (fieldVector == null) {
            return frequencies;
        }
        StringMap<StringMap> terms = (StringMap) fieldVector.get("terms");
        if (terms == null) {
            return frequencies;
        }

        for (Entry<String, StringMap> term : terms.entrySet()) {
            // Accept any numeric representation, not only Double.
            Number frequency = (Number) term.getValue().get("term_freq");
            frequencies.put(term.getKey(), frequency.doubleValue());
        }
        return frequencies;
    }
}