package org.apache.solr.handler.batch; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.File; import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; import java.io.StringReader; import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; import java.util.HashMap; import java.util.HashSet; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Map.Entry; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.document.Document; import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.Term; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanClause.Occur; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.BooleanQuery.Builder; import org.apache.lucene.search.Query; import org.apache.lucene.search.SimpleCollector; import org.apache.lucene.search.TermQuery; import org.apache.solr.common.SolrException; import org.apache.solr.common.SolrException.ErrorCode; import org.apache.solr.common.params.SolrParams; import org.apache.solr.request.SolrQueryRequest; import org.apache.solr.schema.IndexSchema; import org.apache.solr.schema.SchemaField; import org.apache.solr.search.SolrIndexSearcher; public class BatchProviderFindWordGroups extends BatchProvider { @Override public void run(SolrQueryRequest locReq, BatchHandlerRequestQueue queue) throws Exception { SolrParams params = locReq.getParams(); String jobid = params.get("jobid"); String workDir = params.get("#workdir"); File input = new File(workDir + "/" + jobid + ".input"); if (!input.canRead()) { throw new SolrException(ErrorCode.BAD_REQUEST, "No input data available, bark bark - " + input); } List<String> terms = readInputFile(input); final HashSet<String> termMap = new HashSet<String>(); termMap.addAll(terms); SolrIndexSearcher searcher = locReq.getSearcher(); IndexSchema schema = locReq.getCore().getLatestSchema(); File jobFile = new File(workDir + "/" + params.get("jobid")); final BufferedWriter out = new BufferedWriter(new FileWriter(jobFile), 1024*256); final int maxlen = params.getInt("maxlen", 2); final int stopAterReaching = Math.min(params.getInt("stopAfterReaching", 10000), 1000000); float upperLimit = Float.parseFloat(params.get("upperLimit", "1.0")); float lowerLimit = Float.parseFloat(params.get("lowerLimit", "0.9")); final int maxClauses = Math.min(params.getInt("maxClauses", 5), 500); assert upperLimit <= 1.0f && upperLimit > 0.0f; assert lowerLimit < upperLimit && lowerLimit >= 0.0f; final Analyzer analyzer = schema.getIndexAnalyzer(); final HashSet<String> fieldsToLoad = new HashSet<String>(); String[] fields = params.getParams("fields"); for (String f: fields) { for (String ff: f.split("( |,)")) { SchemaField field = schema.getFieldOrNull(ff); if (field==null || !field.stored()) { throw new SolrException(ErrorCode.BAD_REQUEST, "We cannot dump fields that do not exist or are not stored: " + ff); } fieldsToLoad.add(ff); } } final Map<String, Integer> collectedItems = new HashMap<String, Integer>(); while (true) { int origSize = terms.size(); if (terms.size() < 1 || collectedItems.size() > stopAterReaching) { break; } Query query = buildQuery(terms, fieldsToLoad, maxClauses); assert terms.size() < origSize; final BatchHandlerRequestQueue batchQueue = queue; searcher.search(query, new SimpleCollector() { private LeafReader reader; private int processed = 0; private CharTermAttribute termAtt; private PositionIncrementAttribute posIncrAtt; @Override public void collect(int i) throws IOException { if (processed % 10000 == 0) { if(batchQueue.isStopped()) { // inside, because queue is synchronized throw new IOException("Collector interrupted - stopping"); } } if (collectedItems.size() > stopAterReaching) { return; } Document d; d = reader.document(i, fieldsToLoad); processed++; String tokenStr; int keepAdding = -1; for (String f: fieldsToLoad) { String[] vals = d.getValues(f); posIncrAtt = null; for (String s: vals) { //System.out.println("analyzing: " + s); LinkedList<String> tokenQueue = new LinkedList<String>(); TokenStream buffer = analyzer.tokenStream(f, new StringReader(s)); if (!buffer.hasAttribute(CharTermAttribute.class)) { continue; // empty stream } termAtt = buffer.getAttribute(CharTermAttribute.class); if (buffer.hasAttribute(PositionIncrementAttribute.class)) { posIncrAtt = buffer.getAttribute(PositionIncrementAttribute.class); } buffer.reset(); if (posIncrAtt != null) { while (buffer.incrementToken()) { tokenStr = termAtt.toString(); if (tokenStr.trim().equals("")) continue; //System.out.println(tokenStr); if (posIncrAtt.getPositionIncrement() == 0) { if (termMap.contains(tokenStr)) { tokenQueue.removeLast(); tokenQueue.addLast(tokenStr); addEverythingLeftToRight(tokenQueue, collectedItems); keepAdding = maxlen; } continue; } if (tokenQueue.size() >= maxlen) { tokenQueue.removeFirst(); } tokenQueue.addLast(tokenStr); if (termMap.contains(tokenStr)) { addEverythingLeftToRight(tokenQueue, collectedItems); keepAdding = maxlen; } else if (keepAdding-- > 0) { addEverythingLeftToRight(tokenQueue, collectedItems); } } } else { while (buffer.incrementToken()) { tokenStr = termAtt.toString(); if (tokenStr.trim().equals("")) continue; if (tokenQueue.size() >= maxlen) { tokenQueue.removeFirst(); tokenQueue.addLast(tokenStr); } if (termMap.contains(tokenStr)) { addEverythingLeftToRight(tokenQueue, collectedItems); keepAdding = maxlen; } else if (keepAdding-- > 0) { addEverythingLeftToRight(tokenQueue, collectedItems); } } } buffer.close(); } } } private void addEverythingLeftToRight(LinkedList<String> tokenQueue, Map<String, Integer> collectedItems) { if (tokenQueue.size() == 1 || tokenQueue.size() < maxlen) return; String key = tokenQueue.get(0); for (int i=1;i<tokenQueue.size();i++) { key = key + "|" + tokenQueue.get(i); } //System.out.println("adding: " + key); if (collectedItems.containsKey(key)) { collectedItems.put(key, collectedItems.get(key)+1); } else { collectedItems.put(key, 1); } } public void doSetNextReader(LeafReaderContext context) { this.reader = context.reader(); } @Override public boolean needsScores() { return false; } }); if (collectedItems.size() > stopAterReaching) { break; } } // sort results by frequency, highest first List<Entry<String, Integer>> colVal = new ArrayList<Entry<String, Integer>>(collectedItems.size()); for (Entry<String, Integer> e: collectedItems.entrySet()) { colVal.add(e); } Collections.sort(colVal, new Comparator<Entry<String, Integer>>() { @Override public int compare(Entry<String, Integer> o1, Entry<String, Integer> o2) { int f = o2.getValue().compareTo(o1.getValue()); if (f == 0) return o1.getKey().compareTo(o2.getKey()); return f; }; }); int upperL = upperLimit == 1.0f ? 0 : colVal.size() - Math.round(colVal.size() * upperLimit); int lowerL = lowerLimit == 0.0f ? colVal.size() : colVal.size() - Math.round(colVal.size() * lowerLimit); for (int i=upperL; i < colVal.size() && i < lowerL; i++) { Entry<String, Integer> entry = colVal.get(i); out.write(entry.getKey()); out.write("\t"); out.write(Integer.toString(entry.getValue())); out.write("\n"); } out.close(); } private Query buildQuery(List<String> terms, HashSet<String> fieldsToLoad, int maxClauses) { ArrayList<String> toRemove = new ArrayList<String>(); Builder bq = new BooleanQuery.Builder(); String ff = ""; if (fieldsToLoad.size() > 0) { for (String x: fieldsToLoad) { ff = x; } } for (int i=0;i<terms.size() && i<maxClauses;i++) { if (fieldsToLoad.size() > 1) { Builder bbq; bbq = new BooleanQuery.Builder(); for (String f: fieldsToLoad) { bbq.add(new BooleanClause(new TermQuery(new Term(f, terms.get(i))), Occur.SHOULD)); } bq.add(bbq.build(), Occur.SHOULD); } else { bq.add(new BooleanClause(new TermQuery(new Term(ff, terms.get(i))), Occur.SHOULD)); } toRemove.add(terms.get(i)); } for (String t: toRemove) { terms.remove(t); } return bq.build(); } private List<String> readInputFile(File input) throws IOException { ArrayList<String> out = new ArrayList<String>(); BufferedReader br = new BufferedReader(new FileReader(input)); String line; while ((line = br.readLine()) != null) { line = line.toLowerCase().trim(); out.add(line); } br.close(); return out; } @Override public String getDescription() { return "Takes list of terms and find their complements"; } }