/* Generated By:JavaCC: Do not edit this line. NutchAnalysis.java */
// NOTE(review): this file is JavaCC output; behavioral fixes belong in the
// source grammar (NutchAnalysis.jj), not here. Comments below are review
// annotations only — no tokens have been changed.
package org.apache.nutch.analysis;

import java.io.StringReader;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.searcher.Query;
import org.apache.nutch.searcher.QueryFilters;
import org.apache.nutch.searcher.Query.Clause;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apacheache.lucene.analysis.TokenStream;
import java.io.*;
import java.util.*;

/** The JavaCC-generated Nutch lexical analyzer and query parser. */
public class NutchAnalysis implements NutchAnalysisConstants {

  // Common English words dropped from queries (but still indexed).
  private static final String[] STOP_WORDS = {
    "a", "and", "are", "as", "at", "be", "but", "by",
    "for", "if", "in", "into", "is", "it",
    "no", "not", "of", "on", "or", "s", "such",
    "t", "that", "the", "their", "then", "there", "these",
    "they", "this", "to", "was", "will", "with"
  };

  // NOTE(review): raw Set — presumably Set<String>; generic typing would
  // require regenerating from the grammar with a newer template.
  private static final Set STOP_SET = StopFilter.makeStopSet(STOP_WORDS);

  private Analyzer analyzer = null;  // analyzer used by compound() for non-raw fields
  private String queryString;        // original query text, for raw-field substring extraction
  private QueryFilters queryFilters; // identifies "raw" fields that bypass analysis

  /** Constructs a nutch analysis. */
  public NutchAnalysis(String query, Analyzer analyzer) {
    this(new FastCharStream(new StringReader(query)));
    this.analyzer = analyzer;
  }

  /** True iff word is a stop word.  Stop words are only removed from queries.
   * Every word is indexed.  */
  public static boolean isStopWord(String word) {
    return STOP_SET.contains(word);
  }

  /** Construct a query parser for the text in a reader. */
  public static Query parseQuery(String queryString, Configuration conf)
    throws IOException {
    return parseQuery(queryString, null, conf);
  }

  /** Construct a query parser for the text in a reader.
   * @param queryString the raw query text to parse
   * @param analyzer analyzer for term processing; when null a
   *        {@link NutchDocumentAnalyzer} built from conf is used
   * @param conf job configuration, also used to build the QueryFilters
   * @return the parsed {@link Query}
   */
  public static Query parseQuery(String queryString, Analyzer analyzer,
                                 Configuration conf) throws IOException {
    NutchAnalysis parser =
      new NutchAnalysis(queryString,
                        (analyzer != null) ? analyzer
                                           : new NutchDocumentAnalyzer(conf));
    parser.queryString = queryString;
    parser.queryFilters = new QueryFilters(conf);
    return parser.parse(conf);
  }

  /** For debugging.
   * Reads queries from stdin and prints the parse result.
   * NOTE(review): readLine() returns null at EOF, which would NPE inside
   * parseQuery — acceptable for a debug loop, but worth confirming. */
  public static void main(String[] args) throws Exception {
    BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
    while (true) {
      System.out.print("Query: ");
      String line = in.readLine();
      System.out.println(parseQuery(line, NutchConfiguration.create()));
    }
  }

  /** Parse a query.
   * Top-level production: a sequence of optionally +/- prefixed,
   * optionally field-qualified phrases or compound terms. */
  final public Query parse(Configuration conf) throws ParseException {
    Query query = new Query(conf);
    ArrayList<String> terms;
    Token token;
    String field;
    boolean stop;        // candidate for stop-word removal?
    boolean prohibited;  // prefixed with '-'?
    // skip leading non-operator, non-term characters
    nonOpOrTerm();
    label_1:
    while (true) {
      switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
      case WORD:
      case ACRONYM:
      case SIGRAM:
      case PLUS:
      case MINUS:
      case QUOTE:
        ;
        break;
      default:
        jj_la1[0] = jj_gen;
        break label_1;  // no more clauses
      }
      stop=true; prohibited=false; field = Clause.DEFAULT_FIELD;
      // optional '+' (required, never stopped) or '-' (prohibited) prefix
      switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
      case PLUS:
      case MINUS:
        switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
        case PLUS:
          jj_consume_token(PLUS);
          stop=false;
          break;
        case MINUS:
          jj_consume_token(MINUS);
          stop=false;prohibited=true;
          break;
        default:
          jj_la1[1] = jj_gen;
          jj_consume_token(-1);
          throw new ParseException();
        }
        break;
      default:
        jj_la1[2] = jj_gen;
        ;
      }
      // optional "field:" qualifier (unbounded lookahead to disambiguate
      // a field prefix from a plain WORD followed by a colon infix)
      if (jj_2_1(2147483647)) {
        token = jj_consume_token(WORD);
        jj_consume_token(COLON);
        field = token.image;
      } else {
        ;
      }
      switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
      case QUOTE:
        // explicitly quoted phrase — never stop-word-removed
        terms = phrase(field);
        stop=false;
        break;
      case WORD:
      case ACRONYM:
      case SIGRAM:
        // unquoted compound term (implicit phrase)
        terms = compound(field);
        break;
      default:
        jj_la1[3] = jj_gen;
        jj_consume_token(-1);
        throw new ParseException();
      }
      // skip trailing separators before the next clause
      nonOpOrTerm();
      String[] array = terms.toArray(new String[terms.size()]);
      // NOTE(review): '==' here is reference identity against the interned
      // DEFAULT_FIELD constant — presumably intentional so that explicitly
      // qualified fields are never stop-filtered; confirm against Clause.
      if (stop
          && field == Clause.DEFAULT_FIELD
          && terms.size()==1
          && isStopWord(array[0])) {
        // ignore stop words only when single, unadorned terms in default field
      } else {
        if (prohibited)
          query.addProhibitedPhrase(array, field);
        else
          query.addRequiredPhrase(array, field);
      }
    }
    {if (true) return query;}
    throw new Error("Missing return statement in function");
  }

  /** Parse an explicitly quoted phrase query.  Note that this may return a single
   * term, a trivial phrase.
   * For raw fields the literal text between the quotes is returned as one
   * entry instead of individual terms. */
  final public ArrayList<String> phrase(String field) throws ParseException {
    int start;  // column just after the opening quote
    int end;    // column at the end of the last token before the close quote
    ArrayList<String> result = new ArrayList<String>();
    String term;
    jj_consume_token(QUOTE);
    start = token.endColumn;
    // skip any leading separator characters inside the quotes
    label_2:
    while (true) {
      switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
      case PLUS:
      case MINUS:
      case COLON:
      case SLASH:
      case DOT:
      case ATSIGN:
      case APOSTROPHE:
      case WHITE:
        ;
        break;
      default:
        jj_la1[4] = jj_gen;
        break label_2;
      }
      nonTerm();
    }
    // collect terms, skipping separators between them
    label_3:
    while (true) {
      switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
      case WORD:
      case ACRONYM:
      case SIGRAM:
        ;
        break;
      default:
        jj_la1[5] = jj_gen;
        break label_3;
      }
      term = term();
      result.add(term);
      label_4:
      while (true) {
        switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
        case PLUS:
        case MINUS:
        case COLON:
        case SLASH:
        case DOT:
        case ATSIGN:
        case APOSTROPHE:
        case WHITE:
          ;
          break;
        default:
          jj_la1[6] = jj_gen;
          break label_4;
        }
        nonTerm();
      }
    }
    end = token.endColumn;
    // closing quote, or end-of-input (kind 0) for an unterminated phrase
    switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
    case QUOTE:
      jj_consume_token(QUOTE);
      break;
    case 0:
      jj_consume_token(0);
      break;
    default:
      jj_la1[7] = jj_gen;
      jj_consume_token(-1);
      throw new ParseException();
    }
    if (this.queryFilters.isRawField(field)) {
      // raw fields get the unanalyzed source text verbatim
      result.clear();
      result.add(queryString.substring(start, end));
    }
    {if (true) return result;}
    throw new Error("Missing return statement in function");
  }

  /** Parse a compound term that is interpreted as an implicit phrase query.
   * Compounds are a sequence of terms separated by infix characters.  Note that
   * this may return a single term, a trivial compound. */
  final public ArrayList<String> compound(String field) throws ParseException {
    int start;
    ArrayList<String> result = new ArrayList<String>();
    String term;
    StringBuffer terms = new StringBuffer();  // space-joined terms fed to the analyzer
    start = token.endColumn;
    term = term();
    terms.append(term).append(" ");
    //result.add(term);
    label_5:
    while (true) {
      // unbounded lookahead: continue only if infix chars are followed by a term
      if (jj_2_2(2147483647)) {
        ;
      } else {
        break label_5;
      }
      label_6:
      while (true) {
        infix();
        switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
        case PLUS:
        case MINUS:
        case COLON:
        case SLASH:
        case DOT:
        case ATSIGN:
        case APOSTROPHE:
          ;
          break;
        default:
          jj_la1[8] = jj_gen;
          break label_6;
        }
      }
      term = term();
      terms.append(term).append(" ");
      //result.add(term);
    }
    if (this.queryFilters.isRawField(field)) {
      // raw field: return the original source text, unanalyzed
      // result.clear();
      result.add(queryString.substring(start, token.endColumn));
    } else {
      // run the collected terms through the analyzer (shadows the parser's
      // Token field with Lucene's Token on purpose)
      org.apache.lucene.analysis.Token token;
      TokenStream tokens = analyzer.tokenStream(
        field, new StringReader(terms.toString()));
      while (true) {
        try {
          token = tokens.next();
        } catch (IOException e) {
          // best-effort: treat an analyzer error as end-of-stream
          token = null;
        }
        if (token == null) { break; }
        result.add(token.termText());
      }
      try {
        tokens.close();
      } catch (IOException e) {
        // ignore
      }
    }
    {if (true) return result;}
    throw new Error("Missing return statement in function");
  }

  /** Parse a single term (WORD, ACRONYM or SIGRAM) and return its image. */
  final public String term() throws ParseException {
    Token token;
    switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
    case WORD:
      token = jj_consume_token(WORD);
      break;
    case ACRONYM:
      token = jj_consume_token(ACRONYM);
      break;
    case SIGRAM:
      token = jj_consume_token(SIGRAM);
      break;
    default:
      jj_la1[9] = jj_gen;
      jj_consume_token(-1);
      throw new ParseException();
    }
    {if (true) return token.image;}
    throw new Error("Missing return statement in function");
  }

  /** Parse anything but a term or a quote: whitespace or an infix character. */
  final public void nonTerm() throws ParseException {
    switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
    case WHITE:
      jj_consume_token(WHITE);
      break;
    case PLUS:
    case MINUS:
    case COLON:
    case SLASH:
    case DOT:
    case ATSIGN:
    case APOSTROPHE:
      infix();
      break;
    default:
      jj_la1[10] = jj_gen;
      jj_consume_token(-1);
      throw new ParseException();
    }
  }

  /** Like {@link #nonTerm()} but also accepts end-of-input (kind 0). */
  final public void nonTermOrEOF() throws ParseException {
    switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
    case PLUS:
    case MINUS:
    case COLON:
    case SLASH:
    case DOT:
    case ATSIGN:
    case APOSTROPHE:
    case WHITE:
      nonTerm();
      break;
    case 0:
      jj_consume_token(0);
      break;
    default:
      jj_la1[11] = jj_gen;
      jj_consume_token(-1);
      throw new ParseException();
    }
  }

  /** Parse anything but a term or an operator (plus or minus or quote).
   * Consumes separator characters between clauses; a bare +/- is only
   * consumed when followed by a non-term or EOF (2-token lookahead). */
  final public void nonOpOrTerm() throws ParseException {
    label_7:
    while (true) {
      if (jj_2_3(2)) {
        ;
      } else {
        break label_7;
      }
      switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
      case WHITE:
        jj_consume_token(WHITE);
        break;
      case COLON:
      case SLASH:
      case DOT:
      case ATSIGN:
      case APOSTROPHE:
        nonOpInfix();
        break;
      case PLUS:
      case MINUS:
        switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
        case PLUS:
          jj_consume_token(PLUS);
          break;
        case MINUS:
          jj_consume_token(MINUS);
          break;
        default:
          jj_la1[12] = jj_gen;
          jj_consume_token(-1);
          throw new ParseException();
        }
        nonTermOrEOF();
        break;
      default:
        jj_la1[13] = jj_gen;
        jj_consume_token(-1);
        throw new ParseException();
      }
    }
  }

  /** Characters which can be used to form compound terms. */
  final public void infix() throws ParseException {
    switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
    case PLUS:
      jj_consume_token(PLUS);
      break;
    case MINUS:
      jj_consume_token(MINUS);
      break;
    case COLON:
    case SLASH:
    case DOT:
    case ATSIGN:
    case APOSTROPHE:
      nonOpInfix();
      break;
    default:
      jj_la1[14] = jj_gen;
      jj_consume_token(-1);
      throw new ParseException();
    }
  }

  /** Parse infix characters except plus and minus. */
  final public void nonOpInfix() throws ParseException {
    switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
    case COLON:
      jj_consume_token(COLON);
      break;
    case SLASH:
      jj_consume_token(SLASH);
      break;
    case DOT:
      jj_consume_token(DOT);
      break;
    case ATSIGN:
      jj_consume_token(ATSIGN);
      break;
    case APOSTROPHE:
      jj_consume_token(APOSTROPHE);
      break;
    default:
      jj_la1[15] = jj_gen;
      jj_consume_token(-1);
      throw new ParseException();
    }
  }

  // ------------------------------------------------------------------
  // Generated JavaCC lookahead machinery below.  jj_2_N drives syntactic
  // lookahead N; jj_3_N / jj_3R_N are the scan routines for each grammar
  // production; LookaheadSuccess aborts a scan early on success.
  // ------------------------------------------------------------------

  // Lookahead driver for the "WORD COLON" field-qualifier decision.
  final private boolean jj_2_1(int xla) {
    jj_la = xla; jj_lastpos = jj_scanpos = token;
    try { return !jj_3_1(); }
    catch(LookaheadSuccess ls) { return true; }
    finally { jj_save(0, xla); }
  }

  // Lookahead driver for continuing a compound (infix+ term).
  final private boolean jj_2_2(int xla) {
    jj_la = xla; jj_lastpos = jj_scanpos = token;
    try { return !jj_3_2(); }
    catch(LookaheadSuccess ls) { return true; }
    finally { jj_save(1, xla); }
  }

  // Lookahead driver for nonOpOrTerm's 2-token check.
  final private boolean jj_2_3(int xla) {
    jj_la = xla; jj_lastpos = jj_scanpos = token;
    try { return !jj_3_3(); }
    catch(LookaheadSuccess ls) { return true; }
    finally { jj_save(2, xla); }
  }

  final private boolean jj_3_1() {
    if (jj_scan_token(WORD)) return true;
    if (jj_scan_token(COLON)) return true;
    Token xsp;
    xsp = jj_scanpos;
    if (jj_3R_8()) {
      jj_scanpos = xsp;
      if (jj_3R_9()) return true;
    }
    return false;
  }

  final private boolean jj_3R_16() {
    Token xsp;
    xsp = jj_scanpos;
    if (jj_scan_token(7)) {
      jj_scanpos = xsp;
      if (jj_scan_token(8)) {
        jj_scanpos = xsp;
        if (jj_3R_22()) return true;
      }
    }
    return false;
  }

  final private boolean jj_3_3() {
    Token xsp;
    xsp = jj_scanpos;
    if (jj_scan_token(15)) {
      jj_scanpos = xsp;
      if (jj_3R_12()) {
        jj_scanpos = xsp;
        if (jj_3R_13()) return true;
      }
    }
    return false;
  }

  final private boolean jj_3R_25() {
    if (jj_3R_24()) return true;
    return false;
  }

  final private boolean jj_3R_27() {
    if (jj_3R_16()) return true;
    return false;
  }

  final private boolean jj_3R_20() {
    if (jj_3R_11()) return true;
    Token xsp;
    while (true) {
      xsp = jj_scanpos;
      if (jj_3R_25()) { jj_scanpos = xsp; break; }
    }
    return false;
  }

  final private boolean jj_3R_10() {
    if (jj_3R_16()) return true;
    return false;
  }

  final private boolean jj_3R_19() {
    if (jj_3R_24()) return true;
    return false;
  }

  final private boolean jj_3_2() {
    Token xsp;
    if (jj_3R_10()) return true;
    while (true) {
      xsp = jj_scanpos;
      if (jj_3R_10()) { jj_scanpos = xsp; break; }
    }
    if (jj_3R_11()) return true;
    return false;
  }

  final private boolean jj_3R_23() {
    if (jj_3R_24()) return true;
    return false;
  }

  final private boolean jj_3R_18() {
    Token xsp;
    xsp = jj_scanpos;
    if (jj_3R_23()) {
      jj_scanpos = xsp;
      if (jj_scan_token(0)) return true;
    }
    return false;
  }

  final private boolean jj_3R_13() {
    Token xsp;
    xsp = jj_scanpos;
    if (jj_scan_token(7)) {
      jj_scanpos = xsp;
      if (jj_scan_token(8)) return true;
    }
    if (jj_3R_18()) return true;
    return false;
  }

  final private boolean jj_3R_9() {
    if (jj_3R_15()) return true;
    return false;
  }

  final private boolean jj_3R_14() {
    if (jj_scan_token(QUOTE)) return true;
    Token xsp;
    while (true) {
      xsp = jj_scanpos;
      if (jj_3R_19()) { jj_scanpos = xsp; break; }
    }
    while (true) {
      xsp = jj_scanpos;
      if (jj_3R_20()) { jj_scanpos = xsp; break; }
    }
    xsp = jj_scanpos;
    if (jj_scan_token(9)) {
      jj_scanpos = xsp;
      if (jj_scan_token(0)) return true;
    }
    return false;
  }

  final private boolean jj_3R_24() {
    Token xsp;
    xsp = jj_scanpos;
    if (jj_scan_token(15)) {
      jj_scanpos = xsp;
      if (jj_3R_27()) return true;
    }
    return false;
  }

  final private boolean jj_3R_26() {
    if (jj_3R_16()) return true;
    return false;
  }

  final private boolean jj_3R_21() {
    Token xsp;
    if (jj_3R_26()) return true;
    while (true) {
      xsp = jj_scanpos;
      if (jj_3R_26()) { jj_scanpos = xsp; break; }
    }
    if (jj_3R_11()) return true;
    return false;
  }

  final private boolean jj_3R_22() {
    if (jj_3R_17()) return true;
    return false;
  }

  final private boolean jj_3R_8() {
    if (jj_3R_14()) return true;
    return false;
  }

  final private boolean jj_3R_12() {
    if (jj_3R_17()) return true;
    return false;
  }

  final private boolean jj_3R_11() {
    Token xsp;
    xsp = jj_scanpos;
    if (jj_scan_token(1)) {
      jj_scanpos = xsp;
      if (jj_scan_token(2)) {
        jj_scanpos = xsp;
        if (jj_scan_token(3)) return true;
      }
    }
    return false;
  }

  final private boolean jj_3R_15() {
    if (jj_3R_11()) return true;
    Token xsp;
    while (true) {
      xsp = jj_scanpos;
      if (jj_3R_21()) { jj_scanpos = xsp; break; }
    }
    return false;
  }

  final private boolean jj_3R_17() {
    Token xsp;
    xsp = jj_scanpos;
    if (jj_scan_token(10)) {
      jj_scanpos = xsp;
      if (jj_scan_token(11)) {
        jj_scanpos = xsp;
        if (jj_scan_token(12)) {
          jj_scanpos = xsp;
          if (jj_scan_token(13)) {
            jj_scanpos = xsp;
            if (jj_scan_token(14)) return true;
          }
        }
      }
    }
    return false;
  }

  // ---- Generated parser state ----
  public NutchAnalysisTokenManager token_source;
  public Token token, jj_nt;                  // current and next token
  private int jj_ntk;                         // kind of next token, or -1
  private Token jj_scanpos, jj_lastpos;       // lookahead scan cursor/limit
  private int jj_la;                          // remaining lookahead budget
  public boolean lookingAhead = false;
  private int jj_gen;                         // token-consumption generation counter
  final private int[] jj_la1 = new int[16];   // per-choice-point last failing generation
  static private int[] jj_la1_0;              // expected-token bitmask per choice point
  static {
    jj_la1_0();
  }
  private static void jj_la1_0() {
    jj_la1_0 = new int[] {0x38e,0x180,0x180,0x20e,0xfd80,0xe,0xfd80,0x201,0x7d80,0xe,0xfd80,0xfd81,0x180,0xfd80,0x7d80,0x7c00,};
  }
  final private JJCalls[] jj_2_rtns = new JJCalls[3]; // memoized lookahead results
  private boolean jj_rescan = false;          // true while replaying for error reporting
  private int jj_gc = 0;                      // counts consumptions between memo GCs

  /** Constructor with a character stream. */
  public NutchAnalysis(CharStream stream) {
    token_source = new NutchAnalysisTokenManager(stream);
    token = new Token();
    jj_ntk = -1;
    jj_gen = 0;
    for (int i = 0; i < 16; i++) jj_la1[i] = -1;
    for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls();
  }

  /** Reinitialise with a character stream. */
  public void ReInit(CharStream stream) {
    token_source.ReInit(stream);
    token = new Token();
    jj_ntk = -1;
    jj_gen = 0;
    for (int i = 0; i < 16; i++) jj_la1[i] = -1;
    for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls();
  }

  /** Constructor with a generated token manager. */
  public NutchAnalysis(NutchAnalysisTokenManager tm) {
    token_source = tm;
    token = new Token();
    jj_ntk = -1;
    jj_gen = 0;
    for (int i = 0; i < 16; i++) jj_la1[i] = -1;
    for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls();
  }

  /** Reinitialise with a generated token manager. */
  public void ReInit(NutchAnalysisTokenManager tm) {
    token_source = tm;
    token = new Token();
    jj_ntk = -1;
    jj_gen = 0;
    for (int i = 0; i < 16; i++) jj_la1[i] = -1;
    for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls();
  }

  /** Consume the next token, requiring it to be of the given kind;
   * otherwise restore state and throw a ParseException. */
  final private Token jj_consume_token(int kind) throws ParseException {
    Token oldToken;
    if ((oldToken = token).next != null) token = token.next;
    else token = token.next = token_source.getNextToken();
    jj_ntk = -1;
    if (token.kind == kind) {
      jj_gen++;
      if (++jj_gc > 100) {
        // periodically drop stale memoized lookahead entries
        jj_gc = 0;
        for (int i = 0; i < jj_2_rtns.length; i++) {
          JJCalls c = jj_2_rtns[i];
          while (c != null) {
            if (c.gen < jj_gen) c.first = null;
            c = c.next;
          }
        }
      }
      return token;
    }
    token = oldToken;
    jj_kind = kind;
    throw generateParseException();
  }

  // Thrown to abort a lookahead scan as soon as success is known.
  @SuppressWarnings("serial")
  static private final class LookaheadSuccess extends java.lang.Error { }
  final private LookaheadSuccess jj_ls = new LookaheadSuccess();

  /** Scan one token during lookahead; returns true on mismatch. */
  final private boolean jj_scan_token(int kind) {
    if (jj_scanpos == jj_lastpos) {
      jj_la--;
      if (jj_scanpos.next == null) {
        jj_lastpos = jj_scanpos = jj_scanpos.next = token_source.getNextToken();
      } else {
        jj_lastpos = jj_scanpos = jj_scanpos.next;
      }
    } else {
      jj_scanpos = jj_scanpos.next;
    }
    if (jj_rescan) {
      // record the position of this token for error reporting
      int i = 0; Token tok = token;
      while (tok != null && tok != jj_scanpos) { i++; tok = tok.next; }
      if (tok != null) jj_add_error_token(kind, i);
    }
    if (jj_scanpos.kind != kind) return true;
    if (jj_la == 0 && jj_scanpos == jj_lastpos) throw jj_ls;
    return false;
  }

  /** Consume and return the next token. */
  final public Token getNextToken() {
    if (token.next != null) token = token.next;
    else token = token.next = token_source.getNextToken();
    jj_ntk = -1;
    jj_gen++;
    return token;
  }

  /** Return the token index positions ahead without consuming. */
  final public Token getToken(int index) {
    Token t = lookingAhead ? jj_scanpos : token;
    for (int i = 0; i < index; i++) {
      if (t.next != null) t = t.next;
      else t = t.next = token_source.getNextToken();
    }
    return t;
  }

  /** Peek the kind of the next token, caching it in jj_ntk. */
  final private int jj_ntk() {
    if ((jj_nt=token.next) == null)
      return (jj_ntk = (token.next=token_source.getNextToken()).kind);
    else
      return (jj_ntk = jj_nt.kind);
  }

  // ---- Error-reporting bookkeeping ----
  private java.util.Vector<int[]> jj_expentries = new java.util.Vector<int[]>();
  private int[] jj_expentry;
  private int jj_kind = -1;                   // kind expected when the failure occurred
  private int[] jj_lasttokens = new int[100]; // token kinds seen during rescans
  private int jj_endpos;

  /** Accumulate an expected-token sequence observed during error rescan. */
  private void jj_add_error_token(int kind, int pos) {
    if (pos >= 100) return;
    if (pos == jj_endpos + 1) {
      jj_lasttokens[jj_endpos++] = kind;
    } else if (jj_endpos != 0) {
      jj_expentry = new int[jj_endpos];
      for (int i = 0; i < jj_endpos; i++) {
        jj_expentry[i] = jj_lasttokens[i];
      }
      // de-duplicate identical expected sequences
      boolean exists = false;
      for (java.util.Enumeration<int[]> e = jj_expentries.elements(); e.hasMoreElements();) {
        int[] oldentry = (e.nextElement());
        if (oldentry.length == jj_expentry.length) {
          exists = true;
          for (int i = 0; i < jj_expentry.length; i++) {
            if (oldentry[i] != jj_expentry[i]) {
              exists = false;
              break;
            }
          }
          if (exists) break;
        }
      }
      if (!exists) jj_expentries.addElement(jj_expentry);
      if (pos != 0) jj_lasttokens[(jj_endpos = pos) - 1] = kind;
    }
  }

  /** Build a ParseException describing the expected token sequences. */
  public ParseException generateParseException() {
    jj_expentries.removeAllElements();
    boolean[] la1tokens = new boolean[20];
    for (int i = 0; i < 20; i++) {
      la1tokens[i] = false;
    }
    if (jj_kind >= 0) {
      la1tokens[jj_kind] = true;
      jj_kind = -1;
    }
    // recover expected tokens from the choice points hit this generation
    for (int i = 0; i < 16; i++) {
      if (jj_la1[i] == jj_gen) {
        for (int j = 0; j < 32; j++) {
          if ((jj_la1_0[i] & (1<<j)) != 0) {
            la1tokens[j] = true;
          }
        }
      }
    }
    for (int i = 0; i < 20; i++) {
      if (la1tokens[i]) {
        jj_expentry = new int[1];
        jj_expentry[0] = i;
        jj_expentries.addElement(jj_expentry);
      }
    }
    jj_endpos = 0;
    jj_rescan_token();
    jj_add_error_token(0, 0);
    int[][] exptokseq = new int[jj_expentries.size()][];
    for (int i = 0; i < jj_expentries.size(); i++) {
      exptokseq[i] = jj_expentries.elementAt(i);
    }
    return new ParseException(token, exptokseq, tokenImage);
  }

  final public void enable_tracing() {
  }

  final public void disable_tracing() {
  }

  /** Replay memoized lookaheads to collect expected-token info for errors. */
  final private void jj_rescan_token() {
    jj_rescan = true;
    for (int i = 0; i < 3; i++) {
      try {
        JJCalls p = jj_2_rtns[i];
        do {
          if (p.gen > jj_gen) {
            jj_la = p.arg; jj_lastpos = jj_scanpos = p.first;
            switch (i) {
              case 0: jj_3_1(); break;
              case 1: jj_3_2(); break;
              case 2: jj_3_3(); break;
            }
          }
          p = p.next;
        } while (p != null);
      } catch(LookaheadSuccess ls) { }
    }
    jj_rescan = false;
  }

  /** Memoize the result of lookahead index started at the current token. */
  final private void jj_save(int index, int xla) {
    JJCalls p = jj_2_rtns[index];
    while (p.gen > jj_gen) {
      if (p.next == null) { p = p.next = new JJCalls(); break; }
      p = p.next;
    }
    p.gen = jj_gen + xla - jj_la; p.first = token; p.arg = xla;
  }

  // Linked list node memoizing one lookahead attempt.
  static final class JJCalls {
    int gen;
    Token first;
    int arg;
    JJCalls next;
  }

}