/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.nutch.searcher; import java.io.DataInput; import java.io.DataOutput; import java.io.IOException; import java.io.BufferedReader; import java.io.InputStreamReader; import java.util.Arrays; import java.util.ArrayList; // Commons Logging imports import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configurable; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.io.Writable; import org.apache.nutch.analysis.AnalyzerFactory; import org.apache.nutch.analysis.NutchAnalysis; import org.apache.nutch.util.NutchConfiguration; /** A Nutch query. */ public final class Query implements Writable, Cloneable, Configurable { public static final Log LOG = LogFactory.getLog(Query.class); /** A query clause. */ public static class Clause implements Cloneable { public static final String DEFAULT_FIELD = "DEFAULT"; private static final byte REQUIRED_BIT = 1; private static final byte PROHIBITED_BIT = 2; private static final byte PHRASE_BIT = 4; private boolean isRequired; private boolean isProhibited; private String field = DEFAULT_FIELD; private float weight = 1.0f; private Object termOrPhrase; private Configuration conf; public Clause(Term term, String field, boolean isRequired, boolean isProhibited, Configuration conf) { this(term, isRequired, isProhibited, conf); this.field = field; } public Clause(Term term, boolean isRequired, boolean isProhibited, Configuration conf) { this.isRequired = isRequired; this.isProhibited = isProhibited; this.termOrPhrase = term; this.conf = conf; } public Clause(Phrase phrase, String field, boolean isRequired, boolean isProhibited, Configuration conf) { this(phrase, isRequired, isProhibited, conf); this.field = field; } public Clause(Phrase phrase, boolean isRequired, boolean isProhibited, Configuration conf) { this.isRequired = isRequired; this.isProhibited = isProhibited; this.termOrPhrase = phrase; this.conf = conf; } public boolean isRequired() { return isRequired; } public boolean isProhibited() { return isProhibited; } public String getField() { return field; } public float getWeight() { return weight; } public void setWeight(float weight) { this.weight = weight; } public boolean isPhrase() { return termOrPhrase instanceof Phrase; } public Phrase getPhrase() { return (Phrase)termOrPhrase; } public Term getTerm() { return (Term)termOrPhrase; } public void write(DataOutput out) throws IOException { byte bits = 0; if (isPhrase()) bits |= PHRASE_BIT; if (isRequired) bits |= REQUIRED_BIT; if (isProhibited) bits |= PROHIBITED_BIT; out.writeByte(bits); out.writeUTF(field); out.writeFloat(weight); if (isPhrase()) getPhrase().write(out); else getTerm().write(out); } public static Clause read(DataInput in, Configuration conf) throws IOException { byte bits = in.readByte(); boolean required = ((bits & REQUIRED_BIT) != 0); boolean prohibited = ((bits & PROHIBITED_BIT) != 0); String field = in.readUTF(); float weight = in.readFloat(); Clause clause; if ((bits & PHRASE_BIT) == 0) { clause = new Clause(Term.read(in), field, required, prohibited, conf); } else { clause = new Clause(Phrase.read(in), field, required, prohibited, conf); } clause.weight = weight; return clause; } public String toString() { StringBuffer buffer = new StringBuffer(); // if (isRequired) // buffer.append("+"); // else if (isProhibited) buffer.append ("-"); if (!DEFAULT_FIELD.equals(field)) { buffer.append(field); buffer.append(":"); } if (!isPhrase() && new QueryFilters(conf).isRawField(field)) { buffer.append('"'); // quote raw terms buffer.append(termOrPhrase.toString()); buffer.append('"'); } else { buffer.append(termOrPhrase.toString()); } return buffer.toString(); } public boolean equals(Object o) { if (!(o instanceof Clause)) return false; Clause other = (Clause)o; return (this.isRequired == other.isRequired) && (this.isProhibited == other.isProhibited) && (this.weight == other.weight) && (this.termOrPhrase == null ? other.termOrPhrase == null : this.termOrPhrase.equals(other.termOrPhrase)); } public int hashCode() { return (this.isRequired ? 0 : 1) ^ (this.isProhibited ? 2 : 4) ^ Float.floatToIntBits(this.weight) ^ (this.termOrPhrase != null ? termOrPhrase.hashCode() : 0); } public Object clone() { try { return super.clone(); } catch (CloneNotSupportedException e) { throw new RuntimeException(e); } } } /** A single-term query clause. */ public static class Term { private String text; public Term(String text) { this.text = text; } public void write(DataOutput out) throws IOException { out.writeUTF(text); } public static Term read(DataInput in) throws IOException { String text = in.readUTF(); return new Term(text); } public String toString() { return text; } public boolean equals(Object o) { if (!(o instanceof Term)) return false; Term other = (Term)o; return text == null ? other.text == null : text.equals(other.text); } public int hashCode() { return text != null ? text.hashCode() : 0; } } /** A phrase query clause. */ public static class Phrase { private Term[] terms; public Phrase(Term[] terms) { this.terms = terms; } public Phrase(String[] terms) { this.terms = new Term[terms.length]; for (int i = 0; i < terms.length; i++) { this.terms[i] = new Term(terms[i]); } } public Term[] getTerms() { return terms; } public void write(DataOutput out) throws IOException { out.writeByte(terms.length); for (int i = 0; i < terms.length; i++) terms[i].write(out); } public static Phrase read(DataInput in) throws IOException { int length = in.readByte(); Term[] terms = new Term[length]; for (int i = 0; i < length; i++) terms[i] = Term.read(in); return new Phrase(terms); } public String toString() { StringBuffer buffer = new StringBuffer(); buffer.append("\""); for (int i = 0; i < terms.length; i++) { buffer.append(terms[i].toString()); if (i != terms.length-1) buffer.append(" "); } buffer.append("\""); return buffer.toString(); } public boolean equals(Object o) { if (!(o instanceof Phrase)) return false; Phrase other = (Phrase)o; if (!(this.terms.length == this.terms.length)) return false; for (int i = 0; i < terms.length; i++) { if (!this.terms[i].equals(other.terms[i])) return false; } return true; } public int hashCode() { int hashCode = terms.length; for (int i = 0; i < terms.length; i++) { hashCode ^= terms[i].hashCode(); } return hashCode; } } private ArrayList<Clause> clauses = new ArrayList<Clause>(); private Configuration conf; private static final Clause[] CLAUSES_PROTO = new Clause[0]; public Query() { } public Query(Configuration conf) { this.conf = conf; } public void setConf(Configuration conf) { this.conf = conf; } public Configuration getConf() { return conf; } /** Return all clauses. */ public Clause[] getClauses() { return clauses.toArray(CLAUSES_PROTO); } /** Add a required term in the default field. */ public void addRequiredTerm(String term) { addRequiredTerm(term, Clause.DEFAULT_FIELD); } /** Add a required term in a specified field. */ public void addRequiredTerm(String term, String field) { clauses.add(new Clause(new Term(term), field, true, false, this.conf)); } /** Add a prohibited term in the default field. */ public void addProhibitedTerm(String term) { addProhibitedTerm(term, Clause.DEFAULT_FIELD); } /** Add a prohibited term in the specified field. */ public void addProhibitedTerm(String term, String field) { clauses.add(new Clause(new Term(term), field, false, true, this.conf)); } /** Add a required phrase in the default field. */ public void addRequiredPhrase(String[] terms) { addRequiredPhrase(terms, Clause.DEFAULT_FIELD); } /** Add a required phrase in the specified field. */ public void addRequiredPhrase(String[] terms, String field) { if (terms.length == 0) { // ignore empty phrase } else if (terms.length == 1) { addRequiredTerm(terms[0], field); // optimize to term query } else { clauses.add(new Clause(new Phrase(terms), field, true, false, this.conf)); } } /** Add a prohibited phrase in the default field. */ public void addProhibitedPhrase(String[] terms) { addProhibitedPhrase(terms, Clause.DEFAULT_FIELD); } /** Add a prohibited phrase in the specified field. */ public void addProhibitedPhrase(String[] terms, String field) { if (terms.length == 0) { // ignore empty phrase } else if (terms.length == 1) { addProhibitedTerm(terms[0], field); // optimize to term query } else { clauses.add(new Clause(new Phrase(terms), field, false, true, this.conf)); } } public void write(DataOutput out) throws IOException { out.writeByte(clauses.size()); for (int i = 0; i < clauses.size(); i++) clauses.get(i).write(out); } public static Query read(DataInput in, Configuration conf) throws IOException { Query result = new Query(conf); result.readFields(in); return result; } public void readFields(DataInput in) throws IOException { clauses.clear(); int length = in.readByte(); for (int i = 0; i < length; i++) clauses.add(Clause.read(in, this.conf)); } public String toString() { StringBuffer buffer = new StringBuffer(); for (int i = 0; i < clauses.size(); i++) { buffer.append(clauses.get(i).toString()); if (i != clauses.size()-1) buffer.append(" "); } return buffer.toString(); } public boolean equals(Object o) { if (!(o instanceof Query)) return false; Query other = (Query)o; return this.clauses.equals(other.clauses); } public int hashCode() { return this.clauses.hashCode(); } public Object clone() { Query clone = null; try { clone = (Query)super.clone(); } catch (CloneNotSupportedException e) { throw new RuntimeException(e); } clone.clauses = (ArrayList<Clause>)clauses.clone(); return clone; } /** Flattens a query into the set of text terms that it contains. These are * terms which should be higlighted in matching documents. */ public String[] getTerms() { ArrayList<String> result = new ArrayList<String>(); for (int i = 0; i < clauses.size(); i++) { Clause clause = clauses.get(i); if (!clause.isProhibited()) { if (clause.isPhrase()) { Term[] terms = clause.getPhrase().getTerms(); for (int j = 0; j < terms.length; j++) { result.add(terms[j].toString()); } } else { result.add(clause.getTerm().toString()); } } } return result.toArray(new String[result.size()]); } /** * Parse a query from a string using a language specific analyzer. * * @param queryString is the raw query string to parse * @param queryLang is a two-letters language code used to identify which * {@link org.apache.nutch.analysis.NutchAnalyzer} should be used * to parse the query string. * @see org.apache.nutch.analysis.AnalyzerFactory */ public static Query parse(String queryString, String queryLang, Configuration conf) throws IOException { return fixup(NutchAnalysis.parseQuery( queryString, AnalyzerFactory.get(conf).get(queryLang), conf), conf); } /** Parse a query from a string. */ public static Query parse(String queryString, Configuration conf) throws IOException { return parse(queryString, null, conf); } /** Convert clauses in unknown fields to the default field. */ private static Query fixup(Query input, Configuration conf) { // walk the query Query output = new Query(conf); Clause[] clauses = input.getClauses(); for (int i = 0; i < clauses.length; i++) { Clause c = clauses[i]; if (!new QueryFilters(conf).isField(c.getField())) { // unknown field ArrayList<Term> terms = new ArrayList<Term>(); // add name to query if (c.isPhrase()) { terms.addAll(Arrays.asList(c.getPhrase().getTerms())); } else { terms.add(c.getTerm()); } terms.add(0, new Term(c.getField())); // add to front of phrase c = (Clause)c.clone(); c.field = Clause.DEFAULT_FIELD; // use default field instead c.termOrPhrase = new Phrase(terms.toArray(new Term[terms.size()])); } output.clauses.add(c); // copy clause to output } return output; } /** For debugging. */ public static void main(String[] args) throws Exception { BufferedReader in = new BufferedReader(new InputStreamReader(System.in)); Configuration conf = NutchConfiguration.create(); while (true) { System.out.print("Query: "); String line = in.readLine(); Query query = parse(line, conf); System.out.println("Parsed: " + query); System.out.println("Translated: " + new QueryFilters(conf).filter(query)); } } }