/**
 * Treats documents as text. Parses out only DOCNO.
 * XML Tags are discarded
 */
package org.apache.lucenesandbox.xmlindexingdemo;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;

import java.io.*;
import java.util.*;

/**
 * Line-oriented reader that turns a file of {@code <DOC>} records into Lucene
 * {@link Document}s.
 *
 * <p>For every record the identifier found between {@code <DOCNO>} and
 * {@code </DOCNO>} is stored in a {@code DOCNO} field, and the lines found
 * between {@code <TEXT>} and {@code </TEXT>} are stripped of markup and indexed
 * (not stored) in a {@code text} field. The file is parsed eagerly by the
 * constructor; the results are available through {@link #getDocuments()}.
 */
public class XMLDocumentHandlerTxt
{
    /**
     * Character sequences replaced by a single space after tag stripping.
     * Compiled once because {@link #stripTagsOut(String)} runs for every text line.
     *
     * NOTE(review): these alternatives look like HTML entities (nbsp, hyph, amp,
     * sect, bull) that may have been decoded somewhere along the way; as written,
     * the bare '&amp;' alternative matches every ampersand. Confirm against the
     * intended input before changing the pattern.
     */
    private static final java.util.regex.Pattern ENTITY_PATTERN =
        java.util.regex.Pattern.compile( "␣|&hyph;|&|§|•" );

    /** Document currently being populated; null until the first DOCNO has been seen. */
    private Document doc;

    /** All documents created so far, in file order. */
    private ArrayList docs;

    /**
     * Parses the given file immediately.
     *
     * @param xmlFile file containing the documents to read
     * @throws IOException if the file cannot be opened or read
     */
    public XMLDocumentHandlerTxt(File xmlFile) throws IOException
    {
        docs = new ArrayList();
        parse(xmlFile);
    }

    /**
     * Reads {@code file} line by line and appends one {@link Document} to the
     * internal list for every {@code </DOCNO>} line encountered.
     *
     * <p>Tag detection is per line: a line containing {@code <TEXT>} or
     * {@code </TEXT>} only toggles text mode, its own content is not indexed.
     *
     * @param file file to read (decoded with the platform default charset, as
     *             {@link FileReader} does)
     * @throws IOException if reading fails
     * @throws NoSuchElementException if a DOCNO section holds fewer than two tokens
     */
    public void parse( File file) throws IOException
    {
        BufferedReader reader = new BufferedReader( new FileReader( file ) );
        try
        {
            StringBuffer strb = new StringBuffer(100);
            String docline = "";
            boolean intext = false;
            boolean indocline = false;

            // readLine() returning null is the only reliable end-of-stream signal;
            // ready() merely says whether a read would block.
            String line;
            while ( (line = reader.readLine()) != null )
            {
                if ( line.indexOf( "</DOCNO>" ) != -1 )
                {
                    // End of the DOCNO section: the document can be created now.
                    docline += line;
                    doc = createDoc( docline );
                    docs.add( doc );
                    docline = "";
                    indocline = false;
                }
                else if ( line.indexOf( "<DOCNO>" ) != -1 )
                {
                    // DOCNO section that continues on following lines.
                    docline += line;
                    indocline = true;
                }
                else if (indocline)
                {
                    docline += line.replaceAll("[!]", "");
                }
                else if ( line.toUpperCase().indexOf("<TEXT>") != -1 )
                {
                    intext = true;
                }
                else if (line.toUpperCase().indexOf("</TEXT>") != -1 )
                {
                    intext = false;
                }
                else if ( doc != null && intext)
                {
                    // Body text: drop markup and buffer it for the current document.
                    strb.append( " " + stripTagsOut( line ) + " " );
                }

                // End of record: flush the buffered text into the current document.
                if ( line.indexOf( "</DOC>" ) != -1 )
                {
                    // A record without any DOCNO leaves doc null; skip instead of failing.
                    if ( doc != null )
                    {
                        doc.add(new Field("text", strb.toString(), Field.Store.NO,
                                          Field.Index.ANALYZED,
                                          Field.TermVector.WITH_POSITIONS_OFFSETS));
                    }
                    strb = new StringBuffer(100);
                }
            }
        }
        finally
        {
            // Release the file handle even when parsing fails part-way through.
            reader.close();
        }
    }

    /**
     * Removes everything between {@code '<'} and {@code '>'} (brackets included)
     * from a single line, then replaces the sequences matched by
     * {@link #ENTITY_PATTERN} with a space.
     *
     * <p>State is per call: a tag left open at the end of {@code str} suppresses
     * the remainder of that line only.
     *
     * @param str line to clean; must not be null
     * @return the cleaned text
     */
    public String stripTagsOut( String str)
    {
        boolean ignore = false;
        StringBuffer strb = new StringBuffer();
        for ( int i = 0; i < str.length(); i++ )
        {
            char ch = str.charAt( i );
            if ( ch == '<' )
            {
                ignore = true;
            }
            else if ( ch == '>' )
            {
                ignore = false;
            }
            else if ( !ignore )
            {
                strb.append( ch );
            }
        }

        return ENTITY_PATTERN.matcher( strb.toString() ).replaceAll( " " );
    }

    /**
     * Creates a document holding only its identifier, from a string of the form
     * {@code <DOCNO> FBIS3-1 </DOCNO>}.
     *
     * <p>The string is split on whitespace and angle brackets; the second token
     * is taken as the document number and added as a stored, not-analyzed
     * {@code DOCNO} field.
     *
     * @param str the DOCNO section
     * @return a new document containing the DOCNO field
     * @throws NoSuchElementException if {@code str} has fewer than two tokens
     */
    public Document createDoc( String str )
    {
        StringTokenizer tknzr = new StringTokenizer( str, " \t\n\r\f<>" );
        tknzr.nextToken();                  // first token is the tag name itself
        String docNo = tknzr.nextToken();   // whitespace is a delimiter, so no trim needed

        Document newDoc = new Document();
        newDoc.add(new Field("DOCNO", docNo, Field.Store.YES, Field.Index.NOT_ANALYZED));
        return newDoc;
    }

    /**
     * @return the internal list of parsed {@link Document}s, in file order
     *         (the live list, not a copy)
     */
    public ArrayList getDocuments()
    {
        return docs;
    }
}