package germandeli;
import java.io.IOException;
import java.util.List;
import java.util.regex.Matcher;
import net.htmlparser.jericho.*;
import java.util.regex.Pattern;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.cas.StringArray;
import org.apache.uima.jcas.cas.StringList;
import org.apache.uima.resource.ResourceInitializationException;
public class GermanDeliProductAE extends JCasAnnotator_ImplBase {
private Pattern descrIngred = Pattern.compile("Ingredients: (.*?)\\.");
private final String sensitiv1 = "gdcom_2194_304287"; // 1 not sens.
private final String sensitiv2 = "gdcom_2194_844664"; // 2 heat sens.
private final String sensitiv3 = "not defined"; //3
private final String sensitiv4 = "gdcom_2194_59878433"; //4 frozen
private static final String PARAM_SUPERTYPES = "SuperTypes";
private String superTypesInput = "";
/*
*Specifiers to find the feature in html source.
*ToDo: Make a template
*/
private String rootAttributeSpecifier = "id";
private String rootAttributeSpecifierValue = "bodycontent";
private String nameAttributeSpecifier = "class";
private String nameAttributeSpecifierValue = "title-inner";
private String imageAttributeSpecifier = "class";
private String imageAttributeSpecifierValue = "item-images";
private String persAttributeSpecifier = "class";
private String persAttributeSpecifierValue = "perishable";
private String brandAttributeSpecifier = "class";
private String brandAttributeSpecifierValue = "brand";
private String weightSizeAttributeSpecifier = "class";
private String weightSizeAttributeSpecifierValue = "weight";
private String idAttributeSpecifier = "class";
private String idAttributeSpecifierValue = "code";
private String priceAttributeSpecifier = "class";
private String priceAttributeSpecifierValue = "sale-price";
private String descrAttributeSpecifier = "name";
private String descrAttributeSpecifierValue = "description";
private String ingrAttributeSpecifier = "name";
private String ingrAttributeSpecifierValue = "ingredients";
private String supertypeAttributeSpecifier = "class";
private String supertypeAttributeSpecifierValue = "scBreadcrumbs";
/*
* Flags
*/
private Boolean nameFound = false;
private Boolean imageFound = false;
private Boolean persFound = false;
private Boolean brandFound = false;
private Boolean weightFound = false;
private Boolean idFound = false;
private Boolean priceFound = false;
private Boolean descrFound = false;
private Boolean ingredientsFound = false;
/*
* To do
public void initialize(UimaContext aUimaContext) throws ResourceInitializationException {
// TODO Auto-generated method stub
super.initialize(aUimaContext);
superTypesInput= (String)aUimaContext.getConfigParameterValue(PARAM_SUPERTYPES);
}
*/
public void process(JCas aJCas) throws AnalysisEngineProcessException {
String docText = aJCas.getDocumentText();
initFlags();
//System.out.println(hyperLinkPattern.pattern());
System.out.println("++");
/*
*
*Parse the document
*To do:
*Implement a automaton extractor, which extracts features with an automate.
*The automate should be given as a parameter.
*
*/
Source src = new Source(docText);
MicrosoftConditionalCommentTagTypes.register();
PHPTagTypes.register();
PHPTagTypes.PHP_SHORT.deregister();
src.fullSequentialParse();
List<Element> orderElements = src.getAllElements(HTMLElementName.DIV);
//search for the body or root Elment
Element bodyElement = null;
for (Element tmpElement : orderElements) {
String bodyAttribute = tmpElement.getAttributeValue(rootAttributeSpecifier);
if (bodyAttribute == null) continue;
else {
if (bodyAttribute.equals(rootAttributeSpecifierValue)) {
bodyElement = tmpElement;
break;
}
}
}
//check if root was found
if (bodyElement != null) {
List<Element> bodyElements = bodyElement.getAllElements();
String productName = null;
String brand = null;
String id = null;
String weightSize = null;
String price = null;
String description = null;
String ingredients = null;
String pers = null;
String superTypes = null;
Product product = new Product(aJCas);
aJCas.setDocumentLanguage("en");
for (Element tmpElement : bodyElements) {
//get supertypes
if (superTypes == null) {
String supertypeElementAttr = tmpElement.getAttributeValue(supertypeAttributeSpecifier);
//System.out.println(nameElementAttr);
product.setBegin(tmpElement.getBegin());
if (supertypeElementAttr != null) {
if (supertypeElementAttr.equals(supertypeAttributeSpecifierValue)) {
//extract the supertypes
List<Element> supertypeElements = tmpElement.getAllElements(HTMLElementName.A);
superTypes = "";
for (Element tmpSupertype : supertypeElements) {
String tmp = tmpSupertype.getContent().getTextExtractor().toString();
if (tmp != null) {
if (!tmp.equals("Home")) {
superTypes = superTypes + tmp + ",";
}
}
}
System.out.println("Extracted Supertypes: " + superTypes);
}
}
}
//Get the name
if (!nameFound) {
String nameElementAttr = tmpElement.getAttributeValue(nameAttributeSpecifier);
//System.out.println(nameElementAttr);
//product.setBegin(tmpElement.getBegin());
if (nameElementAttr != null) {
if (nameElementAttr.equals(nameAttributeSpecifierValue)) {
productName = tmpElement.getContent().getTextExtractor().toString();
System.out.println("Extracted Product: " + productName);
nameFound = true;
}
}
}
//get image Path
if (!imageFound && nameFound) {
String persElementAttr = tmpElement.getAttributeValue(imageAttributeSpecifier);
if (persElementAttr != null) {
if (persElementAttr.equals(imageAttributeSpecifierValue)) {
//check for the image
List<Element> innerImages = tmpElement.getAllElements(HTMLElementName.IMG);
if (!innerImages.isEmpty()) {
Element img = innerImages.get(0);
pers = img.getAttributeValue("src");
if (pers != null) {
System.out.println("Extracted imageurl:" + pers);
/*
try {
ScanProductPage.loadImageAndSaveToFile(imgSrc, productName.substring(0, 15));
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}*/
imageFound = true;
}
}
}
}
}
//Get the peris
if (!persFound) {
String persElementAttr = tmpElement.getAttributeValue(persAttributeSpecifier);
if (persElementAttr != null) {
if (persElementAttr.equals(persAttributeSpecifierValue)) {
//check for the image
List<Element> innerImages = tmpElement.getAllElements(HTMLElementName.IMG);
if (!innerImages.isEmpty()) {
Element img = innerImages.get(0);
String imgSrc = img.getAttributeValue("src");
if (imgSrc != null) {
System.out.println("Extracted perimageurl " + imgSrc);
persFound = true;
}
}
}
}
}
//Get the brand
if (!brandFound) {
String brandElementAttr = tmpElement.getAttributeValue(brandAttributeSpecifier);
if (brandElementAttr != null) {
if (brandElementAttr.equals(brandAttributeSpecifierValue) && tmpElement.getName().equals(HTMLElementName.TR)) {
brand = tmpElement.getContent().getTextExtractor().toString();
brand = brand.substring(7);
System.out.println("Extracted Brand: " + brand);
brandFound = true;
}
}
}
//Get the id
if (!idFound) {
String idElementAttr = tmpElement.getAttributeValue(idAttributeSpecifier);
if (idElementAttr != null) {
if (idElementAttr.equals(idAttributeSpecifierValue) && tmpElement.getName().equals(HTMLElementName.TR)) {
id = tmpElement.getContent().getTextExtractor().toString();
id = id.substring(8);
System.out.println("Extracted id: " + id);
idFound = true;
}
}
}
//Get the weightSize
if (!weightFound) {
String weightSizeElementAttr = tmpElement.getAttributeValue(weightSizeAttributeSpecifier);
if (weightSizeElementAttr != null) {
if (weightSizeElementAttr.equals(weightSizeAttributeSpecifierValue) && tmpElement.getName().equals(HTMLElementName.TR)) {
weightSize = tmpElement.getContent().getTextExtractor().toString();
weightSize = weightSize.substring(13);
System.out.println("Extracted Weight: " + weightSize);
weightFound = true;
}
}
}
//Get the price
if (!priceFound) {
String priceElementAttr = tmpElement.getAttributeValue(priceAttributeSpecifier);
if (priceElementAttr != null) {
if (priceElementAttr.equals(priceAttributeSpecifierValue) && tmpElement.getName().equals(HTMLElementName.TR)) {
price = tmpElement.getContent().getTextExtractor().toString();
price = price.substring(12);
System.out.println("Extracted Price: " + price);
priceFound = true;
}
}
}
//Get the descripton
if (!descrFound) {
String descrElementAttr = tmpElement.getAttributeValue(descrAttributeSpecifier);
if (descrElementAttr != null) {
if (descrElementAttr.equals(descrAttributeSpecifierValue) && tmpElement.getName().equals(HTMLElementName.DIV)) {
description = tmpElement.getContent().getTextExtractor().toString();
System.out.println("Extracted description: " + description);
descrFound = true;
product.setEnd(tmpElement.getEnd());
}
}
}
//Get the ingredients
if (!ingredientsFound) {
String ingrElementAttr = tmpElement.getAttributeValue(ingrAttributeSpecifier);
if (ingrElementAttr != null) {
if (ingrElementAttr.equals(ingrAttributeSpecifierValue) && tmpElement.getName().equals(HTMLElementName.DIV)) {
ingredients = tmpElement.getContent().getTextExtractor().toString();
System.out.println("Extracted ingredients:" + ingredients);
ingredientsFound = true;
product.setEnd(tmpElement.getEnd());
}
}
}
/*
* Token extracting done
* check if everything is extracted
*/
}
//System.out.println(count);
/*
* Token extracting done
* check if everything is extracted
*/
if (superTypes != null) {
product.setSupertypes(makeStringList(superTypes.substring(0, superTypes.length()),aJCas));
}
if (nameFound) {
product.setName(productName);
}
if (persFound) {
if (pers.endsWith(sensitiv1)) {
product.setPerishability((short) 1);
//System.out.println(1);
}
else if (pers.endsWith(sensitiv2)) {
product.setPerishability((short) 2);
//System.out.println(2);
}
else if (pers.endsWith(sensitiv3)) {
product.setPerishability((short) 3);
//System.out.println(3);
}
else if (pers.endsWith(sensitiv4)) {
product.setPerishability((short) 4);
//System.out.println(4);
}
else {
product.setPerishability((short) 5);
//System.out.println(5);
}
}
if (brandFound) {
product.setBrand(brand);
}
if (idFound) {
product.setId(id);
}
if (weightFound) {
float weightFloat[] = getGAndOZ(weightSize);
product.setWeightInG(weightFloat[0]);
product.setWeightInOZ(weightFloat[1]);
}
if (priceFound) {
product.setPrice(price);
}
if (descrFound) {
product.setDescription(description);
}
if (!ingredientsFound && descrFound) {
Matcher ingreMatcher = descrIngred.matcher(description);
if (ingreMatcher.find()) {
product.setIngredients(makeStringList(ingreMatcher.group(1),aJCas));
System.out.println("Extracted with Regex from descr:" + ingreMatcher.group(1));
}
}
if (ingredientsFound) {
product.setIngredients(makeStringList(ingredients,aJCas));
}
product.addToIndexes();
}
}
/**
* Makes a list from a string
* @param input
* @param aJCas
* @return
*/
private StringArray makeStringList(String input,JCas aJCas) {
String[] strArray;
input = input.trim();
strArray = input.split(",");
if (strArray != null) {
for(int i = 0; i < strArray.length; i++) {
strArray[i].trim();
}
}
StringArray result = new StringArray(aJCas,strArray.length);
result.copyFromArray(strArray, 0, 0, strArray.length);
return result;
}
/**
* Gets the G and OZ from a string
* @param input
* @return 1 = G,2 = OZ
*/
private float[] getGAndOZ(String input) {
float[] result = new float[2];
Pattern gPattern = Pattern.compile("(\\d+)g");
Matcher gM = gPattern.matcher(input);
if (gM.find()) {
result[0] = Float.valueOf(gM.group(1));
System.out.println(gM.group(1));
}
else {
result[0] = 0;
}
Pattern ozPattern = Pattern.compile("(\\d+)oz");
Matcher ozM = ozPattern.matcher(input);
if (ozM.find()) {
result[1] = Float.valueOf(ozM.group(1));
}
else {
result[1] = 0;
}
return result;
}
private void initFlags() {
nameFound = false;
persFound = false;
brandFound = false;
weightFound = false;
idFound = false;
priceFound = false;
descrFound = false;
imageFound = false;
ingredientsFound = false;
}
}