/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.parse.rtf;
// RTF Parser imports
import com.etranslate.tm.processing.rtf.RTFParserDelegate;
// JDK imports
import java.util.Arrays;
import java.util.List;
import java.util.Properties;
// Nutch imports
import org.apache.nutch.metadata.DublinCore;
import org.apache.nutch.metadata.Office;
/**
 * A parser delegate for handling rtf events.
 *
 * <p>Two things are collected from the parser callbacks: the text reported
 * in the document context (see {@link #getText()}) and the textual entries
 * of the info group (see {@link #getMetaData()}).</p>
 *
 * <p>A group whose first token is the <code>\*</code> control symbol is
 * skipped as a whole, including any groups nested inside it.</p>
 *
 * <p>State is never reset ({@link #startDocument()} is a no-op), so use a
 * fresh instance for every document.</p>
 *
 * @author Andy Hedges
 */
public class RTFParserDelegateImpl implements RTFParserDelegate {

  /** Not read or written anywhere in this class. */
  String tabs = "";

  /** Info-group values collected so far, keyed by control word name. */
  Properties metadata = new Properties();

  /** Info-group control words (without backslash) that carry a text value. */
  String[] META_NAMES_TEXT = {
    DublinCore.TITLE,
    DublinCore.SUBJECT,
    Office.AUTHOR,
    "manager",
    "company",
    "operator",
    "category",
    Office.KEYWORDS,
    Office.COMMENTS,
    "doccomm",
    "hlinkbase"
  };

  /**
   * Info-group control words (without backslash) that carry a date value:
   * creation, revision, last print and backup time.
   */
  String[] META_NAMES_DATE = {
    "creatim",
    "revtim",
    "printim",
    "buptim"
  };

  /** Name of the info control word whose text value is expected next. */
  String metaName = "";

  List metaNamesText = Arrays.asList(META_NAMES_TEXT);
  List metaNamesDate = Arrays.asList(META_NAMES_DATE);

  /** True between a text-valued info control word and its text. */
  boolean isMetaTextValue = false;

  /** Never set by this class; dates are not collected yet. */
  boolean isMetaDateValue = false;

  /** True only until the first event that follows an openGroup call. */
  boolean justOpenedGroup = false;

  /** True while inside a group that started with the \* control symbol. */
  boolean ignoreMode = false;

  /** Accumulates document text; a builder avoids re-copying on each event. */
  private final StringBuilder contentBuffer = new StringBuilder();

  /**
   * Number of currently open groups, counted from our own open/close calls.
   * Assumes the parser reports every group with a matching pair of calls.
   */
  private int groupDepth = 0;

  /** Value of groupDepth for the group that switched ignoreMode on. */
  private int ignoreDepth = 0;

  /**
   * Receives a run of text.
   *
   * <p>In the info context the text becomes the value of the pending
   * metadata name (only the first run after the control word is used). In
   * the document context it is appended to the extracted content unless an
   * ignorable group is being skipped.</p>
   *
   * @param text the text run
   * @param style not used
   * @param context parser context, compared against IN_INFO / IN_DOCUMENT
   */
  public void text(String text, String style, int context) {
    justOpenedGroup = false;
    if (isMetaTextValue && context == IN_INFO) {
      metadata.setProperty(metaName, text);
      isMetaTextValue = false;
    } else if (context == IN_DOCUMENT && !ignoreMode) {
      contentBuffer.append(text);
    }
  }

  /**
   * Receives a control symbol. The \* symbol directly after a group opening
   * starts ignore mode for that group.
   *
   * @param controlSymbol the symbol, including its leading backslash
   * @param context not used
   */
  public void controlSymbol(String controlSymbol, int context) {
    // When already ignoring, keep the outermost depth so that a nested \*
    // group cannot end ignore mode before the outer group closes.
    if (justOpenedGroup && !ignoreMode && "\\*".equals(controlSymbol)) {
      ignoreMode = true;
      ignoreDepth = groupDepth;
    }
    justOpenedGroup = false;
  }

  /**
   * Receives a control word. In the info context, a word listed in
   * META_NAMES_TEXT arms collection of the following text as its value.
   *
   * @param controlWord the control word; a leading backslash is stripped
   * @param value not used
   * @param context parser context, compared against IN_INFO
   */
  public void controlWord(String controlWord, int value, int context) {
    justOpenedGroup = false;
    if (controlWord == null || controlWord.length() == 0) {
      return;
    }
    String name = controlWord;
    if (name.startsWith("\\")) {
      name = name.substring(1);
    }
    if (context != IN_INFO) {
      return;
    }
    if (metaNamesText.contains(name)) {
      isMetaTextValue = true;
      metaName = name;
    } else if (metaNamesDate.contains(name)) {
      //TODO: collect up the dates
    }
  }

  /**
   * Marks the start of a group.
   *
   * @param depth depth reported by the parser; not used, depth is tracked
   *        locally instead
   */
  public void openGroup(int depth) {
    groupDepth++;
    justOpenedGroup = true;
  }

  /**
   * Marks the end of a group. Ignore mode ends only when the group that
   * started it closes, not when a group nested inside it closes.
   *
   * @param depth depth reported by the parser; not used
   */
  public void closeGroup(int depth) {
    justOpenedGroup = false;
    if (ignoreMode && groupDepth <= ignoreDepth) {
      ignoreMode = false;
    }
    // Guard against an unmatched close so the counter never goes negative.
    if (groupDepth > 0) {
      groupDepth--;
    }
  }

  /** No-op; style information is not used. */
  public void styleList(List styles) {
  }

  /** No-op. */
  public void startDocument() {
  }

  /** No-op. */
  public void endDocument() {
  }

  /**
   * Returns the document text collected so far.
   *
   * @return the concatenated text runs, never null
   */
  public String getText() {
    return contentBuffer.toString();
  }

  /**
   * Returns the metadata collected from the info group.
   *
   * @return the live Properties instance held by this delegate
   */
  public Properties getMetaData() {
    return metadata;
  }
}