package germandeli; import java.awt.Image; import java.awt.image.BufferedImage; import java.io.BufferedReader; import java.io.DataInputStream; import java.io.File; import java.io.FileReader; import java.io.FilenameFilter; import java.io.IOException; import java.io.InputStreamReader; import java.net.HttpURLConnection; import java.net.MalformedURLException; import java.net.URL; import java.util.ArrayList; import java.util.LinkedList; import javax.imageio.ImageIO; import org.apache.uima.UIMAFramework; import org.apache.uima.analysis_engine.AnalysisEngine; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; import org.apache.uima.resource.ResourceSpecifier; import org.apache.uima.util.InvalidXMLException; import org.apache.uima.util.XMLInputSource; public class ScanProductPage { /** * Scans a germandeli product page * @param args */ public static void main(String[] args) { /* * Initialization of the analysis engines */ //Create Analysis engine for Product features AnalysisEngine ae = null; try { ae = ScanProductPage.loadAnalysisEngine("descriptors/ProductAEDescriptor.xml"); } catch (InvalidXMLException e) { System.err.println("Descriptor wrong"); e.printStackTrace(); } catch (ResourceInitializationException e) { System.err.println("Initialization Problem"); e.printStackTrace(); } catch (IOException e) { System.err.println("Could not read the file"); e.printStackTrace(); } //Create Analysis engine for writing the Product features to xml AnalysisEngine writer = null; try { writer = ScanProductPage.loadAnalysisEngine("descriptors/ProductSaveToXMLAE.xml"); } catch (InvalidXMLException e) { System.err.println("Descriptor wrong"); e.printStackTrace(); } catch (ResourceInitializationException e) { System.err.println("Initialization Problem"); e.printStackTrace(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } //Create Jcas if (ae != null) { ArrayList<String> pagesToScan = ScanProductPage.readInStringsFromDirectory("examplePagesGermanDeli"); System.out.println(pagesToScan.size()); if (pagesToScan != null) { while (!pagesToScan.isEmpty()) { String page = pagesToScan.remove(0); JCas aJCas = null; try { aJCas = ae.newJCas(); } catch (ResourceInitializationException e) { System.err.println("Initialization Problem"); e.printStackTrace(); } //set document aJCas.setDocumentText(page); //process document try { ae.process(aJCas); } catch (AnalysisEngineProcessException e) { System.err.println("Error processing aJcas"); e.printStackTrace(); } //write to file if (writer != null) { try { writer.process(aJCas); } catch (AnalysisEngineProcessException e) { System.err.println("Problem writting to the xml file"); e.printStackTrace(); } } } } else { System.out.println("Directory is empty"); } } } public static String readStringFromFile(String fileName) throws IOException { File file = new File(fileName); FileReader fileReader = new FileReader(file); BufferedReader buffReader = new BufferedReader(fileReader); String result = ""; while (true) { String line = buffReader.readLine(); //System.out.println(line); if (line == null) { break; } else { result += line; } } buffReader.close(); fileReader.close(); return result; } /** * Loads the Analysis Engine from a descriptor file * @param descriptor * @return * @throws IOException * @throws InvalidXMLException * @throws ResourceInitializationException */ public static AnalysisEngine loadAnalysisEngine(String descriptor) throws IOException, InvalidXMLException, ResourceInitializationException { XMLInputSource in = null; ResourceSpecifier specifier = null; //Load the Descriptor in = new XMLInputSource(descriptor); specifier = UIMAFramework.getXMLParser().parseResourceSpecifier(in); //Create Analysis Engine //ae = return UIMAFramework.produceAnalysisEngine(specifier); } /** * Reads in a directory and returns the files as strings * @param directory * @return */ public static ArrayList<String> readInStringsFromDirectory(String directory) { File file = new File(directory); ArrayList<String> resultList = new ArrayList<String>(); if (!file.isDirectory()) { return null; } //To do implement filenameFilter to filter files File[] files = file.listFiles(); //read each file for (int i = 0;i < files.length;i++) { if (files[i].isDirectory()) { resultList.addAll(readInStringsFromDirectory(file.getName())); } else { try { resultList.add(readStringFromFile(files[i].getPath())); } catch (IOException e) { System.err.println("Error reading file: " + files[i].getName() + "the file will not be added to the list"); e.printStackTrace(); } } } return resultList; } /** * Load an Image and save it to a File * @param urlName * @param fileName * @throws IOException */ public static void loadImageAndSaveToFile(String urlName,String fileName) throws IOException { URL start = null; try { start = new URL(urlName); } catch(MalformedURLException e) { start = new URL("http://"+urlName); } BufferedImage img = null; try { img = ImageIO.read(start.openStream()); } catch (IOException e) { } if (img != null) { File outputFile = new File("Results/" + fileName + ".jpg"); ImageIO.write(img, "jpg", outputFile); } else { System.err.println("Image " + fileName + "could not be read"); } } }