package com.villemos.ispace.assembler.helper; import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.camel.Exchange; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import com.villemos.ispace.aperture.InformationObject; public class ReferenceIdBodyEnricher { /** The logger. */ private static final Log LOG = LogFactory.getLog(ReferenceIdBodyEnricher.class); protected Integer delta = 1; protected List<String> dontPrint = new ArrayList<String>(); { dontPrint.add("ALOS-ARC/arc.doc"); dontPrint.add("ALOS-CB/cb.doc"); dontPrint.add("ALOS-CS/cs.doc"); dontPrint.add("ALOS-DRTF/drtf.doc"); dontPrint.add("ALOS-FS/fs.doc"); dontPrint.add("ALOS-ING/ing.doc"); dontPrint.add("ALOS-MCS/mcs.doc"); dontPrint.add("ALOS-PD/pd.doc"); dontPrint.add("ALOS-SP/sp.doc"); dontPrint.add("CARE-CMS/OIVV Test Report CARE-CMS 3.1.0 v1.0.doc"); dontPrint.add("CARE-CMS/OIVV Test Report CARE-CMS 3.2.0 v1.0.doc"); //dontPrint.add("CARE-CMS/OIVV-CARECMS-v3.0.1v0.1-TVR.doc"); dontPrint.add("CDS-CI-MC2/ProductDeletionNotificationInterface.doc"); dontPrint.add("CDS-SCI-GEST/OIVV-GEST-v2 4 5-TVR v1.doc"); dontPrint.add("CDS-SCI-SPDM/OIVV-SPDM-v2 7 1-TVR v1 .doc"); dontPrint.add("CDS-SCI-SPDM/OIVV-SPDM-v2.8-TVR v1.doc"); dontPrint.add("CDS-SPR-AR/MS901 activation key.pdf"); dontPrint.add("CDS-SPR-AR/TP-07.05 Statistics validation and distribution.doc"); dontPrint.add("Cryosat-PDS-SDF/C2-RN-ACS-GS-0405_(RRN)_PDS_1.2.5_v1.1.pdf"); dontPrint.add("Cryosat-PDS-SDF/PDS UpdateProcedure_1_2_5.pdf"); dontPrint.add("CS-DIS/OIVV-CS-DIS-1.11.00-v1.0TVR.doc"); dontPrint.add("CS-DIS/[SRN] EFC4-ACS-SR-12-10316-217_v1_1.doc"); // Should be parsable...? dontPrint.add("CUS/CUSTools-STD-INTC-0054 v1.0 (CUSTools v4.0.24 STD SSDD).doc"); // Should be parsable...? dontPrint.add("CUT/Test Validation Report CUT 2 0 5 v1.0.doc"); dontPrint.add("DAIL/OIVV-DAIL-v6 3 2 1-TVR v1.doc"); dontPrint.add("DAIL/OIVV-DAIL-v6.3.4-TV v1.doc"); dontPrint.add("DDS-TDDM/FilterReport.pdf"); dontPrint.add("DESCW/DESCW_SAOM.doc"); dontPrint.add("DESCW/OIVV_Test_Report_DESCW_4.74 v1.doc"); dontPrint.add("DESCW/OIVV_Test_Report_DESCW_4.74_v2.1.doc"); dontPrint.add("DESCW/OIVV_Test_Report_DESCW_4.74_v2.2.doc"); dontPrint.add("DESCW/OIVV_Test_Report_DESCW_4.75 V1.doc"); dontPrint.add("DRS/OIVV-DRS-1.3.0-TVR v1 doc.doc"); dontPrint.add("E-OA/eoa installation.doc"); dontPrint.add("E-OA/OIVV-E-OA-1.02.00-TVR-V1.1.doc"); dontPrint.add("E-OA/[srn] efc2-acs-sr-09-03495-172.doc"); dontPrint.add("E-OA/[SRN] EFC2-ACS-SR-10-4439-183.doc"); dontPrint.add("E-OA/[SRN] EFC4-ACS-SR-09-3431-0168.doc"); dontPrint.add("E-OA/[SRN]_EFC4-ACS-SR-E-OA-0163.doc"); dontPrint.add("EOLI-SA/EOLI_Server_SAOM_4.0.doc"); dontPrint.add("EOLI-SA/OIVV Test Report EOLI SA 9 1 2 v1.doc"); dontPrint.add("EOLI-SA/OIVV Test Report EOLI SA 9.1.0 v1 .doc"); dontPrint.add("EOLI-SA/OIVV Test Report EOLI SA 9.1.3 V1.doc"); dontPrint.add("EOLI-SA/OIVV Test Report EOLI SA 9.1.4V1.doc"); dontPrint.add("EOLI-SA/OIVV Test Report EOLI0VSA 09.1.1V1.doc"); dontPrint.add("EOLI-SA/OIVV Test Report EOLISA 9.0.0 V.1.doc"); dontPrint.add("EOLI-SA/OIVV-EOLISSA-v7.2.2-TVR 1-0.doc"); dontPrint.add("EOLI-SA/VEGA-EOLI-SA-SRN-226.doc"); dontPrint.add("EOLI-SA/VEGA-EOLI-SA-SRN-263.doc"); dontPrint.add("EOLI-SA/VEGA-EOLI-SA-SRS_v7.2.1.doc"); dontPrint.add("EOLI-Server/eoli-server_patch_2_9_1_installation.doc"); dontPrint.add("EOLI-Server/OIVV Test Report EOLI Server 2.10.4.doc"); dontPrint.add("EOLI-Server/OIVV_Test_Report_EOLI_Server_2_10.doc"); dontPrint.add("EOLI-Server/OIVV_Test_Report_EOLI_Server_3.0.3v1.doc"); dontPrint.add("EOLI-Server/OIVV_Test_Report_EOLI_Server_3.0.V1.2.doc"); dontPrint.add("EOLI-Server/OIVV_Test_Report_EOLI_Server_3.0.v1.doc"); dontPrint.add("EOLI-Server/OIVV_Test_Report_EOLI_Server_4.0.1 v1.doc"); dontPrint.add("EOLI-Server/OIVV_Test_Report_EOLI_Server_4.0.2 V1.doc"); dontPrint.add("EOLI-Server/OIVV_Test_Report_EOLI_Server_4.0.3[1].doc"); dontPrint.add("EOLI-Server/OIVV_Test_Report_EOLI_Server_4.0.4_v1.doc"); dontPrint.add("EOLI-Server/OIVV_Test_Report_EOLI_Server_4.0.5_v1.0.doc"); dontPrint.add("EOLI-Server/OIVV_Test_Report_EOLI_Server_4.0_v1.doc"); dontPrint.add("EOLI-Server/vega-eoli-server-srn-182.doc"); dontPrint.add("EOLI-Server/VEGA-EOLI-Server-SRN-232(1).doc"); dontPrint.add("EOLI-Server/VEGA-EOLI-Server-SRN-235.doc"); dontPrint.add("EOLI-Server/VEGA-EOLI-Server-SRN-244.doc"); dontPrint.add("EOLI-Server/VEGA-EOLI-Server-SRN-247.doc"); dontPrint.add("EOLI-Server/VEGA-EOLI-Server-SRN-249.doc"); dontPrint.add("EOLI-Server/VEGA-EOLI-Server-SRN-260.doc"); dontPrint.add("EOLI-Server/VEGA-EOLI-Server-SRN-275_v3.0.doc"); dontPrint.add("EOLI-Server/VEGA-EOLI-Server-SRN-287_v3.0.1.doc"); dontPrint.add("EOLI-Server/VEGA-EOLI-Server-SRN-299_v3.0.2.doc"); dontPrint.add("EOLI-Server/VEGA-EOLI-Server-SRN-303_v4.0.doc"); dontPrint.add("EOLI-Server/VEGA-EOLI-Server-SRN-304_v3.0.3.doc"); dontPrint.add("EOLI-Server/VEGA-EOLI-Server-SRN-312_v3.0.4.doc"); dontPrint.add("EOLI-Server/VEGA-EOLI-Server-SRN-318_v4.0.1.doc"); dontPrint.add("EOLI-Server/VEGA-EOLI-Server-SRN-323_v4.0.2.doc"); dontPrint.add("EOLI-Server/VEGA-EOLI-Server-SRN-327_v4.0.3.doc"); dontPrint.add("EOLI-Server/VEGA-EOLI-Server-SRN-328_v4.0.4.doc"); dontPrint.add("EXTPS/envisat_2010_unittest.pdf"); dontPrint.add("HigherLevel/AE-ID-ESA-GS-001_Master_ICD_2.2C.pdf"); dontPrint.add("GSA/gsa_sdd.1.1.pdf"); // Cant figure out pattern... dontPrint.add("GSA/gsa_srd.1.5.doc"); // Cant figure out pattern... dontPrint.add("HigherLevel/aden-gs-veg-id-0028-02_2.pdf"); dontPrint.add("Cryosat-PDS-ARF/ESA-MA-ACS-GS-0105_[SUM-ARF]_v1.1.pdf"); // Unreadable dontPrint.add("Cryosat-PDS-IPF/ESA-MA-ACS-GS-0106_[SUM-IPF]_v1.1.pdf"); // Unreadable dontPrint.add("Cryosat-PDS-MCF/ESA-MA-ACS-GS-0103_[SUM-MCF]_v1.1.pdf"); // Unreadable dontPrint.add("Cryosat-PDS-SDF/ESA-MA-ACS-GS-0104_[SUM-SDF]_v1.1.pdf"); // Unreadable dontPrint.add("CS-DIS/efc4-acs-tp-mdpscs-0117.pdf"); // Unreadable dontPrint.add("HigherLevel/AE-ID-ESC-FS-3000_FOS_MMPF_ICD_2.3.pdf"); dontPrint.add("HigherLevel/ch3_17.doc"); dontPrint.add("HigherLevel/ch4_17.doc"); dontPrint.add("HigherLevel/descw_if.pdf"); dontPrint.add("HigherLevel/GSC-IC-52-8920 R2-CDS ICD 2-2.pdf"); dontPrint.add("HigherLevel/INFEO_NG - Core ICD.pdf"); dontPrint.add("HigherLevel/IPF_System_Status.pdf"); dontPrint.add("HigherLevel/N7950-SPOT-VGT-CCN3-CDS IF ICD Tailoring-v1.4.pdf"); dontPrint.add("HigherLevel/N7950-SPOT-VGT-CCN3-HMA Catalogue ICD Tailoring-v1.4.pdf"); dontPrint.add("HigherLevel/N7950-SPOT-VGT-CCN3-HMA Ordering ICD Tailoring-v1.4.pdf"); dontPrint.add("HigherLevel/OES01-99073-SDS-ICD.pdf"); dontPrint.add("HigherLevel/OSMV-OPMT-EOPG-TN-10-0001 v1.1 (EOP-G Technical Baseline).doc"); dontPrint.add("HigherLevel/Taitus Generic Schema_1.1.doc"); dontPrint.add("HigherLevel/TMapsICD.pdf"); dontPrint.add("HigherLevel/TTS-IND-ICD-001- MMS ICD.pdf"); dontPrint.add("HigherLevel/VEGA-EOLI-Server-ICD-195-1.3.doc"); dontPrint.add("INFEO-EOLI/EOLI-INFEO-EVO-Installation_Procedure.doc"); dontPrint.add("INFEO-ING/ING-Installation_Procedure.doc"); dontPrint.add("INFEO-ING/SOE-ING-ADD-Annex-Javadoc.pdf"); dontPrint.add("INFEO-ING/SOE-ING-SOAM-Annex-ECHO_OA.pdf"); dontPrint.add("LI/2008-07-22_li_training_esrin.pdf"); dontPrint.add("LI/2008-07-23_li_training_esrin.pdf"); dontPrint.add("LI/2008-07-24_li_training_esrin.pdf"); dontPrint.add("LI/OIVV-PL-LI-1 01 01-TVR v1.doc"); dontPrint.add("LI/pl_oql_um.pdf"); dontPrint.add("LI/PL_Service_Manual-1.3.pdf"); dontPrint.add("LI/PL_User_Manual-1.2.pdf"); dontPrint.add("MACH/MACH-D06-SystemTestPlan1.2.doc"); dontPrint.add("MACH/MACH-D11-OT_ArchitectureDocument.pdf"); dontPrint.add("MACH/MACH-D12-OT_SoftwareUserManual.pdf"); dontPrint.add("MERCI/merci-atp-1.9.1.doc"); dontPrint.add("MERCI/MERCI-ATP-1.9.2.doc"); dontPrint.add("MERCI/MERCI-ATP-1.9.3.doc"); dontPrint.add("MERCI/MERCI-ATP-2.0.0.doc"); dontPrint.add("MERCI/MERCI-FAT-REPORT-1.9.3.doc"); dontPrint.add("MERCI/MERCI-FAT-REPORT-2.0.0.doc"); dontPrint.add("MERCI/merci-saom-1.9.1.doc"); dontPrint.add("MERCI/MERCI-SRN-1.9.3.doc"); dontPrint.add("MERCI/MERCI-SRN-2.0.0.doc"); dontPrint.add("MERCI/MERCI_Manual_1_9_2.doc"); dontPrint.add("MERCI/MERCI_Manual_1_9_3.doc"); dontPrint.add("MERCI/MERCI_Manual_2_0_0.doc"); dontPrint.add("MMOHS/Test procedure for MMOHS-CR-10-01212.doc"); dontPrint.add("OT/SRN-OT-MMFI-1.01.00.pdf"); dontPrint.add("OT/sum-32.5.7.pdf"); dontPrint.add("PFD/pfd_ten_mmfi_0535_eads.doc"); dontPrint.add("RSE/RSE_SRD_1-0.doc"); dontPrint.add("Savoir/OSME-USMP-SEDA-RS-07-1373 v4.0 (SaVoir Visualisation Tool Requirements Specification).doc"); dontPrint.add("SDS/DOX-RBE-#41390-v1-Station_Data_Server-Software_Installation_Manual.pdf"); dontPrint.add("SDS/DOX-RBE-#41391-v1-Station_Data_Server_-Software_User_Manual.pdf"); dontPrint.add("StatRep/CRQ4452 - Support for RSIF loading on Statrep2.doc"); dontPrint.add("StatRep/DWHE-MISSION-INTEGRATION-GUIDE-TN-3200-INT-1.1.doc"); dontPrint.add("ULS/DIMS_UL-User-Manual_1.0.pdf"); dontPrint.add("UM-SSO/SIE-EO-OP-UM-SSO-SRD-2.2.1_Signed.pdf"); dontPrint.add("UM-SSO/updatefrom181to1811.doc"); dontPrint.add("VSSGS/VSSGS Administrator Manual.doc"); dontPrint.add("VSSGS/vssgs.pdf"); dontPrint.add("WMS2EOS/SM-WMS2EOS-SAOMTD-13.doc"); dontPrint.add("WMS2EOS/SM-WMS2EOS-SAOMTD-14.doc"); dontPrint.add("WMS2EOS/SM-WMS2EOS-SDD-12.doc"); } public class PatternEntry { public PatternEntry(Integer group, String name) { super(); this.group = group; this.name = name; } public Integer group; public String name; } protected Map<Pattern, List<PatternEntry>> patterns = new HashMap<Pattern, List<PatternEntry>>(); { patterns.put(Pattern.compile("Title:{0,1}(.{0,100}?)Contract Ref\\.:{0,1}(.{0,100}?)Issue:{0,1}(.{0,10}?)Rev\\.:{0,1}(.{0,10}?)Document Ref\\.:{0,1}(.{0,100}?)Consortium Reference(.{0,100}?)Date:{0,1}", Pattern.DOTALL | Pattern.MULTILINE), new ArrayList() {{ add(new PatternEntry(5, "body doc ref")); add(new PatternEntry(6, "body misc 1"));}}); patterns.put(Pattern.compile("Title:{0,1}(.{0,100}?)Contract Ref\\.:{0,1}(.{0,100}?)Issue:{0,1}(.{0,10}?)Rev\\.:{0,1}(.{0,10}?)Consortium Ref\\.:{0,1}(.{0,100}?)Date:{0,1}", Pattern.DOTALL | Pattern.MULTILINE), new ArrayList() {{ add(new PatternEntry(5, "body doc ref"));}}); patterns.put(Pattern.compile("Title:{0,1}(.{0,200}?)Contract Ref\\.:{0,1}(.{0,100}?)Doc\\. Ref\\.:{0,1}(.{0,100}?)Issue:{0,1}(.{0,10}?)Rev\\.:{0,1}(.{0,10}?)Consortium(.{0,10}?)Reference(.{0,100}?)Date:{0,1}", Pattern.DOTALL | Pattern.MULTILINE), new ArrayList() {{ add(new PatternEntry(3, "body doc ref")); add(new PatternEntry(7, "body misc 1"));}}); patterns.put(Pattern.compile("Title:{0,1}(.{0,100}?)Contract Ref\\.:{0,1}(.{0,100}?)Issue:{0,1}(.{0,10}?)Rev\\.:{0,1}(.{0,10}?)Document Ref\\.:{0,1}(.{0,100}?)Date:{0,1}", Pattern.DOTALL | Pattern.MULTILINE), new ArrayList() {{ add(new PatternEntry(5, "body doc ref"));}}); patterns.put(Pattern.compile("Title:{0,1}(.{0,100}?)Contract Ref\\.:{0,1}(.{0,100}?)Consortium Ref\\.:{0,1}(.{0,100}?)Issue:{0,1}(.{0,10}?)Rev\\.:{0,1}(.{0,10}?)Date:{0,1}", Pattern.DOTALL | Pattern.MULTILINE), new ArrayList() {{ add(new PatternEntry(3, "body doc ref"));}}); patterns.put(Pattern.compile("Title:{0,1}(.{0,200}?)Contract Ref\\.:{0,1}(.{0,100}?)Doc\\. Ref\\.:{0,1}(.{0,100}?)Consortium(.{0,10}?)Reference(.{0,100}?)Issue:{0,1}(.{0,10}?)Rev\\.:{0,1}(.{0,10}?)Date:{0,1}", Pattern.DOTALL | Pattern.MULTILINE), new ArrayList() {{ add(new PatternEntry(3, "body doc ref")); add(new PatternEntry(5, "body misc 1"));}}); patterns.put(Pattern.compile("Title:{0,1}(.{0,100}?)Contract Ref\\.:{0,1}(.{0,100}?)Consortium Reference(.{0,100}?)Issue:{0,1}(.{0,10}?)Rev\\.:{0,1}(.{0,10}?)Date:{0,1}", Pattern.DOTALL | Pattern.MULTILINE), new ArrayList() {{ add(new PatternEntry(3, "body doc ref"));}}); patterns.put(Pattern.compile("Title:{0,1}(.{0,100}?)Contract Ref\\.:{0,1}(.{0,100}?)Issue:{0,1}(.{0,10}?)Rev\\.:{0,1}(.{0,10}?)Consortium Reference(.{0,100}?)Date:{0,1}", Pattern.DOTALL | Pattern.MULTILINE), new ArrayList() {{ add(new PatternEntry(5, "body doc ref"));}}); patterns.put(Pattern.compile("Title:{0,1}(.{0,100}?)Issue:{0,1}(.{0,100}?)Rev\\.:{0,1}(.{0,100}?)Consortium Ref\\.:{0,1}(.{0,100}?)Date:{0,1}", Pattern.DOTALL | Pattern.MULTILINE), new ArrayList() {{ add(new PatternEntry(4, "body doc ref"));}}); patterns.put(Pattern.compile("Filename:(.{0,100}?)page(.{0,100}?)Last update:(.{0,100}?)Title:(.{0,100}?)\\((.{0,100}?)\\)\\s+by", Pattern.DOTALL), new ArrayList() {{ add(new PatternEntry(5, "body doc ref"));}}); patterns.put(Pattern.compile("Contract No :(.{0,100}?)WP No :(.{0,100}?)Document Ref :(.{0,100}?)Issue Date :(.{0,100}?)Issue :(.{0,100}?)$", Pattern.DOTALL | Pattern.MULTILINE), new ArrayList() {{ add(new PatternEntry(5, "body doc ref"));}}); patterns.put(Pattern.compile("DocRef: (.{0,100}?)Sub-Contractor:(.{0,100}?)Issue:(.{0,100}?)Procedure Name:(.{0,100}?)Date:", Pattern.DOTALL | Pattern.MULTILINE), new ArrayList() {{ add(new PatternEntry(1, "body doc ref")); }}); patterns.put(Pattern.compile("Code(.{0,5}?):(.{0,100}?)Issue(.{0,5}?):(.{0,100}?)Date(.{0,5}?):(.{0,100}?)", Pattern.DOTALL | Pattern.MULTILINE), new ArrayList() {{ add(new PatternEntry(1, "body doc ref")); }}); patterns.put(Pattern.compile("Doc\\. No\\.:(.{0,100}?)Issue:{0,1}(.{0,10}?)Date:{0,1}(.{0,10}?)Page(.{0,5}?):(.{0,5}?)$(.{0,100}?)$", Pattern.DOTALL | Pattern.MULTILINE), new ArrayList() {{ add(new PatternEntry(6, "body doc ref")); }}); patterns.put(Pattern.compile("Doc\\. No\\.:(.{0,100}?)Issue:{0,1}(.{0,10}?)Date:{0,1}(.{0,10}?)Page(.{0,5}?):(.{5,100}?)$", Pattern.DOTALL | Pattern.MULTILINE), new ArrayList() {{ add(new PatternEntry(5, "body doc ref")); }}); patterns.put(Pattern.compile("Doc\\. No\\.:(.{5,100}?)Issue:{0,1}(.{0,10}?)Date:{0,1}(.{0,50}?)Page", Pattern.DOTALL | Pattern.MULTILINE), new ArrayList() {{ add(new PatternEntry(1, "body doc ref")); }}); patterns.put(Pattern.compile("prepared by/pr�par� par(.{0,200}?)reference/r�ference(.{0,100}?)issue/�dition(.{0,100}?)revision/r�vision(.{0,100}?)date of issue/date d��dition(.{0,100}?)", Pattern.DOTALL | Pattern.MULTILINE), new ArrayList() {{ add(new PatternEntry(2, "body doc ref")); }}); patterns.put(Pattern.compile("prepared by(.{0,100}?)workpackage(.{0,100}?)reference(.{0,100}?)issue(.{0,10}?)revision(.{0,10}?)date of issue(.{0,100}?)Reviewed by(.{0,100}?)Document type(.{0,100}?)Distribution", Pattern.DOTALL | Pattern.MULTILINE), new ArrayList() {{ add(new PatternEntry(3, "body doc ref")); }}); patterns.put(Pattern.compile("DOC:(.{0,100}?)VER:(.{0,100}?)DATE:", Pattern.DOTALL | Pattern.MULTILINE), new ArrayList() {{ add(new PatternEntry(1, "body doc ref")); }}); patterns.put(Pattern.compile("Document Reference:(.{0,100}?)Issue / revision:(.{0,100}?)Issue date:(.{0,100}?)Document author:(.{0,100}?)Document approver:", Pattern.DOTALL | Pattern.MULTILINE), new ArrayList() {{ add(new PatternEntry(1, "body doc ref")); }}); patterns.put(Pattern.compile("Issue:(.{0,100}?)Date:(.{0,100}?)ID:(.{0,100}?)$", Pattern.DOTALL | Pattern.MULTILINE), new ArrayList() {{ add(new PatternEntry(3, "body doc ref")); }}); patterns.put(Pattern.compile("Customer:(.{0,100}?)Contract number:(.{0,100}?)Proposal number:(.{0,100}?)Business/service number:(.{0,100}?)Service Delivery Manager:(.{0,100}?)Reporting to:(.{0,100}?)Service delivery/document reference:(.{0,100}?)Issue:(.{0,10}?)Issue date:(.{0,50}?)Period of validity:(.{0,100}?)", Pattern.DOTALL | Pattern.MULTILINE), new ArrayList() {{ add(new PatternEntry(7, "body doc ref")); }}); patterns.put(Pattern.compile("Implementation(.{0,100}?)Issue(.{0,10}?)Revision(.{0,10}?)Page(.{0,100}?)Date", Pattern.DOTALL | Pattern.MULTILINE), new ArrayList() {{ add(new PatternEntry(1, "body doc ref")); }}); patterns.put(Pattern.compile("Implementation(.{0,100}?)Issue(.{0,30}?)Page(.{0,100}?)Date", Pattern.DOTALL | Pattern.MULTILINE), new ArrayList() {{ add(new PatternEntry(1, "body doc ref")); }}); patterns.put(Pattern.compile("EO DAIL(.{0,100}?)Issue(.{0,30}?)Page(.{0,10}?)of", Pattern.DOTALL | Pattern.MULTILINE), new ArrayList() {{ add(new PatternEntry(1, "body doc ref")); }}); patterns.put(Pattern.compile("reference(.{0,100}?)issue(.{0,10}?)date of issue(.{0,30}?)status(.{0,30}?)Document type(.{0,100}?)Distribution", Pattern.DOTALL | Pattern.MULTILINE), new ArrayList() {{ add(new PatternEntry(1, "body doc ref")); }}); patterns.put(Pattern.compile("Document Id:(.{0,100}?)Issue:(.{0,100}?)Revision:", Pattern.DOTALL | Pattern.MULTILINE), new ArrayList() {{ add(new PatternEntry(1, "body doc ref")); }}); patterns.put(Pattern.compile("reference(.{0,100}?)issue(.{0,20}?)date of issue(.{0,50}?)", Pattern.DOTALL | Pattern.MULTILINE), new ArrayList() {{ add(new PatternEntry(1, "body doc ref")); }}); patterns.put(Pattern.compile("Title:(.{0,100}?)Doc Id:(.{0,100}?)Issue:(.{0,100}?)Rev\\.:(.{0,100}?)Proj\\. Ref\\.:(.{0,100}?)Date:", Pattern.DOTALL | Pattern.MULTILINE), new ArrayList() {{ add(new PatternEntry(2, "body doc ref")); }}); patterns.put(Pattern.compile("Doc\\. Id:(.{0,100}?)Issue:(.{0,100}?)Date:(.{0,100}?)Page:", Pattern.DOTALL | Pattern.MULTILINE), new ArrayList() {{ add(new PatternEntry(1, "body doc ref")); }}); patterns.put(Pattern.compile("Title:(.{0,100}?)Contract Ref\\.:(.{0,100}?)Document Reference(.{0,100}?)Issue:(.{0,100}?)Rev\\.:(.{0,100}?)Date:", Pattern.DOTALL | Pattern.MULTILINE), new ArrayList() {{ add(new PatternEntry(3, "body doc ref")); }}); patterns.put(Pattern.compile("Document Ref(\\s*):(.{0,100}?)Issue Date(\\s*):(.{0,100}?)Issue(\\s*):(.{0,10}?)Title", Pattern.DOTALL | Pattern.MULTILINE), new ArrayList() {{ add(new PatternEntry(2, "body doc ref")); }}); patterns.put(Pattern.compile("CR Reference(.{0,100}?)Task(.{0,100}?)ESA WP Manager", Pattern.DOTALL | Pattern.MULTILINE), new ArrayList() {{ add(new PatternEntry(1, "body doc ref")); }}); patterns.put(Pattern.compile("Doc No:(.{0,100}?)Issue:(.{0,100}?)Date:(.{0,100}?)Page:(.{0,100}?)^(.{0,100}?)$", Pattern.DOTALL | Pattern.MULTILINE), new ArrayList() {{ add(new PatternEntry(5, "body doc ref")); }}); patterns.put(Pattern.compile("reference(.{0,10}?)issue(.{0,10}?)date(.{0,10}?)page(.{0,10}?)^(.{0,100}?)$", Pattern.DOTALL | Pattern.MULTILINE), new ArrayList() {{ add(new PatternEntry(5, "body doc ref")); }}); patterns.put(Pattern.compile("Ref\\.:(.{0,100}?)Issue/Revision:(.{0,100}?)Date:", Pattern.DOTALL | Pattern.MULTILINE), new ArrayList() {{ add(new PatternEntry(1, "body doc ref")); }}); patterns.put(Pattern.compile("Document Number:(.{0,100}?)Issue/Revision:(.{0,100}?)^", Pattern.DOTALL | Pattern.MULTILINE), new ArrayList() {{ add(new PatternEntry(1, "body doc ref")); }}); patterns.put(Pattern.compile("Short Title(.{0,100}?)Prepared by(.{0,100}?)Approved by(.{0,100}?)Reference(.{0,100}?)Issue(.{0,10}?)Revision(.{0,10}?)Date of issue(.{0,100}?)Status", Pattern.DOTALL | Pattern.MULTILINE), new ArrayList() {{ add(new PatternEntry(4, "body doc ref")); }}); patterns.put(Pattern.compile("document:(.{0,100}?)Version:(.{0,20}?)Category:", Pattern.DOTALL | Pattern.MULTILINE), new ArrayList() {{ add(new PatternEntry(1, "body doc ref")); }}); patterns.put(Pattern.compile("Advanced Computer Systems Name :(.{0,100}?)Version :(.{0,100}?)Date :(.{0,100}?)Page :", Pattern.DOTALL | Pattern.MULTILINE), new ArrayList() {{ add(new PatternEntry(1, "body doc ref")); }}); patterns.put(Pattern.compile("Document No Issue/Rev. No Date Page : (.{0,100}?) : Issue (.{0,100}?) :", Pattern.DOTALL | Pattern.MULTILINE), new ArrayList() {{ add(new PatternEntry(1, "body doc ref")); }}); patterns.put(Pattern.compile("Document Number:(.{0,100}?)Issue Date:(.{0,100}?)Issue:(.{0,100}?)Revision:(.{0,100}?)Distribution:", Pattern.DOTALL | Pattern.MULTILINE), new ArrayList() {{ add(new PatternEntry(1, "body doc ref")); }}); patterns.put(Pattern.compile("Doc\\. No\\.:(.{0,100}?)Issue:(.{0,100}?)Revision:(.{0,100}?)Date:", Pattern.DOTALL | Pattern.MULTILINE), new ArrayList() {{ add(new PatternEntry(1, "body doc ref")); }}); patterns.put(Pattern.compile("Doc\\. Title:(.{0,100}?)Issue:(.{0,100}?)Doc\\. Ref:(.{0,100}?)Rev\\.:(.{0,100}?)Date:(.{0,100}?)Page:", Pattern.DOTALL | Pattern.MULTILINE), new ArrayList() {{ add(new PatternEntry(3, "body doc ref")); }}); patterns.put(Pattern.compile("Reference :(.{0,100}?)Version :(.{0,100}?)Date :", Pattern.DOTALL | Pattern.MULTILINE), new ArrayList() {{ add(new PatternEntry(1, "body doc ref")); }}); patterns.put(Pattern.compile("Doc\\. no:(.{0,100}?), Rev:(.{0,100}?)Page", Pattern.DOTALL | Pattern.MULTILINE), new ArrayList() {{ add(new PatternEntry(1, "body doc ref")); }}); patterns.put(Pattern.compile("Reference:(.{0,100}?)Issue:(.{0,100}?)Revision:(.{0,100}?)Date:", Pattern.DOTALL | Pattern.MULTILINE), new ArrayList() {{ add(new PatternEntry(1, "body doc ref")); }}); patterns.put(Pattern.compile("R�F�RENCE :(.{0,100}?)DATE :(.{0,100}?)^(.{0,100}?)$", Pattern.DOTALL | Pattern.MULTILINE), new ArrayList() {{ add(new PatternEntry(3, "body doc ref")); }}); patterns.put(Pattern.compile("Document No(.{0,100}?)Issue/Rev. No(.{0,100}?)Date(.{0,100}?)Page(.{0,100}?):(.{0,100}?):(.{0,100}?):", Pattern.DOTALL | Pattern.MULTILINE), new ArrayList() {{ add(new PatternEntry(5, "body doc ref")); }}); patterns.put(Pattern.compile("Document Id:(.{0,100}?)Issue:(.{0,100}?)Date:(.{0,100}?)Page:", Pattern.DOTALL | Pattern.MULTILINE), new ArrayList() {{ add(new PatternEntry(1, "body doc ref")); }}); patterns.put(Pattern.compile("Reference:(.{0,100}?)Issue:(.{0,100}?)Revision:(.{0,100}?)Distribution Code:", Pattern.DOTALL | Pattern.MULTILINE), new ArrayList() {{ add(new PatternEntry(1, "body doc ref")); }}); patterns.put(Pattern.compile("Title:(.{0,100}?)Contract Ref.:(.{0,100}?)Int. Ref.:(.{0,100}?)Issue:(.{0,100}?)Rev\\.:(.{0,100}?)Proj\\. Ref\\.:(.{0,100}?)Date:", Pattern.DOTALL | Pattern.MULTILINE), new ArrayList() {{ add(new PatternEntry(3, "body doc ref")); }}); patterns.put(Pattern.compile("Document Reference:(.{0,100}?)Document Status:(.{0,100}?)Prepared By:(.{0,100}?)Date:", Pattern.DOTALL | Pattern.MULTILINE), new ArrayList() {{ add(new PatternEntry(1, "body doc ref")); }}); patterns.put(Pattern.compile("Client:(.{0,100}?)Project Reference:(.{0,100}?)Document Reference:(.{0,100}?)File Name:(.{0,100}?)Issue:", Pattern.DOTALL | Pattern.MULTILINE), new ArrayList() {{ add(new PatternEntry(3, "body doc ref")); }}); patterns.put(Pattern.compile("Client:(.{0,100}?)Solenix Project Reference:(.{0,100}?)Document Reference:(.{0,100}?)Version:", Pattern.DOTALL | Pattern.MULTILINE), new ArrayList() {{ add(new PatternEntry(3, "body doc ref")); }}); patterns.put(Pattern.compile("Ref\\.:(.{0,100}?)$(.{0,100}?)Release:(.{0,100}?)Date:", Pattern.DOTALL | Pattern.MULTILINE), new ArrayList() {{ add(new PatternEntry(1, "body doc ref")); }}); patterns.put(Pattern.compile("Title:(.{0,100}?)Contract Ref.:(.{0,100}?)Consortium(.{0,100}?)Reference(.{0,100}?)Issue:(.{0,10}?)Rev.:(.{0,10}?)Date:", Pattern.DOTALL | Pattern.MULTILINE), new ArrayList() {{ add(new PatternEntry(4, "body doc ref")); }}); patterns.put(Pattern.compile("Document Ref\\.:(.{0,100})Issue:(.{0,100})Date:", Pattern.DOTALL | Pattern.MULTILINE), new ArrayList() {{ add(new PatternEntry(1, "body doc ref")); }}); patterns.put(Pattern.compile("Ref :(.{0,100})Issue :(.{0,100})Date :", Pattern.DOTALL | Pattern.MULTILINE), new ArrayList() {{ add(new PatternEntry(1, "body doc ref")); }}); patterns.put(Pattern.compile("Ref:(.{0,100})Issue/Revision:(.{0,100})Date:", Pattern.DOTALL | Pattern.MULTILINE), new ArrayList() {{ add(new PatternEntry(1, "body doc ref")); }}); patterns.put(Pattern.compile("Doc No :(.{0,100})Issue :(.{0,100})Date :", Pattern.DOTALL | Pattern.MULTILINE), new ArrayList() {{ add(new PatternEntry(1, "body doc ref")); }}); patterns.put(Pattern.compile("Ref\\. :(.{0,100})Is\\. :(.{0,100})Rev\\. :(.{0,100})Date :(.{0,100})Page :", Pattern.DOTALL | Pattern.MULTILINE), new ArrayList() {{ add(new PatternEntry(1, "body doc ref")); }}); patterns.put(Pattern.compile("Doc No:(.{0,100})Issue(.{0,100})Date :(.{0,100})Page :", Pattern.DOTALL | Pattern.MULTILINE), new ArrayList() {{ add(new PatternEntry(1, "body doc ref")); }}); patterns.put(Pattern.compile("Doc No :(.{0,100}?)$", Pattern.DOTALL | Pattern.MULTILINE), new ArrayList() {{ add(new PatternEntry(1, "body doc ref")); }}); patterns.put(Pattern.compile("Ref\\.(.{0,100}?)Issue(.{0,100}?)Rev.(.{0,100}?)Page(.{0,100}?):(.{0,100}?):(.{0,100}?):", Pattern.DOTALL | Pattern.MULTILINE), new ArrayList() {{ add(new PatternEntry(5, "body doc ref")); }}); patterns.put(Pattern.compile("Workpackage:(.{0,100}?)Doc. Ref.:(.{0,100}?)Version:(.{0,100}?)Status:(.{0,100}?)Date:", Pattern.DOTALL | Pattern.MULTILINE), new ArrayList() {{ add(new PatternEntry(2, "body doc ref")); }}); patterns.put(Pattern.compile("Doc\\. Ref\\.:(.{0,100}?)Version:(.{0,100}?)Status:(.{0,100}?)Date:", Pattern.DOTALL | Pattern.MULTILINE), new ArrayList() {{ add(new PatternEntry(1, "body doc ref")); }}); patterns.put(Pattern.compile("Document id(.{0,100}?)Author(.{0,100}?)Version(.{0,100}?)Issue date", Pattern.DOTALL | Pattern.MULTILINE), new ArrayList() {{ add(new PatternEntry(1, "body doc ref")); }}); patterns.put(Pattern.compile("Ref\\. :(.{0,100}?)Issue :(.{0,100}?)Date :", Pattern.DOTALL | Pattern.MULTILINE), new ArrayList() {{ add(new PatternEntry(1, "body doc ref")); }}); patterns.put(Pattern.compile("Ref\\. :(.{0,100}?)Iss\\./Rev\\. :(.{0,100}?)Date:(.{0,100}?)^(.{0,100}?)$", Pattern.DOTALL | Pattern.MULTILINE), new ArrayList() {{ add(new PatternEntry(4, "body doc ref")); }}); patterns.put(Pattern.compile("Prepared by(.{0,100}?)Reference(.{0,100}?)Issue(.{0,100}?)Revision(.{0,100}?)Status(.{0,100}?)Date of issue", Pattern.DOTALL | Pattern.MULTILINE), new ArrayList() {{ add(new PatternEntry(2, "body doc ref")); }}); patterns.put(Pattern.compile("Prepared by(.{0,100}?)Reference(.{0,100}?)Issue(.{0,100}?)Revision(.{0,100}?)Date of Issue", Pattern.DOTALL | Pattern.MULTILINE), new ArrayList() {{ add(new PatternEntry(2, "body doc ref")); }}); patterns.put(Pattern.compile("R�f�rence :(.{0,100}?)Version :(.{0,100}?)Date :(.{0,100}?)Page :", Pattern.DOTALL | Pattern.MULTILINE), new ArrayList() {{ add(new PatternEntry(1, "body doc ref")); }}); patterns.put(Pattern.compile("Rev\\. :(.{0,100}?)Date :(.{0,100}?)Reference :(.{0,100}?)Page", Pattern.DOTALL | Pattern.MULTILINE), new ArrayList() {{ add(new PatternEntry(3, "body doc ref")); }}); patterns.put(Pattern.compile("Issue :(.{0,100}?)Date :(.{0,100}?)Revision :(.{0,100}?)Date :(.{0,100}?)Ref. :(.{0,100}?)$", Pattern.DOTALL | Pattern.MULTILINE), new ArrayList() {{ add(new PatternEntry(5, "body doc ref")); }}); patterns.put(Pattern.compile("Project:(.{0,100}?)Doc\\.Ref\\.:(.{0,100}?)Issue:(.{0,100}?)Date:(.{0,100}?)Status:", Pattern.DOTALL | Pattern.MULTILINE), new ArrayList() {{ add(new PatternEntry(2, "body doc ref")); }}); patterns.put(Pattern.compile("Service Delivery Manager:(.{0,100}?)Reporting to:(.{0,100}?)Service delivery/document reference:(.{0,100}?)Issue:(.{0,100}?)First Issue date", Pattern.DOTALL | Pattern.MULTILINE), new ArrayList() {{ add(new PatternEntry(3, "body doc ref")); }}); patterns.put(Pattern.compile("Contract Number(.{0,100}?)Service Delivery Document Reference(.{0,100}?)Service Delivery Manager", Pattern.DOTALL | Pattern.MULTILINE), new ArrayList() {{ add(new PatternEntry(2, "body doc ref")); }}); patterns.put(Pattern.compile("Contract Number(.{0,100}?)Service Delivery Document Reference(.{0,100}?)Service Delivery Manager", Pattern.DOTALL | Pattern.MULTILINE), new ArrayList() {{ add(new PatternEntry(2, "body doc ref")); }}); patterns.put(Pattern.compile("Title:(.{0,100}?)Contract Ref\\.:(.{0,100}?)Issue:(.{0,100}?)Rev\\.:(.{0,100}?)Proj\\. Ref\\.:(.{0,100}?)Date:", Pattern.DOTALL | Pattern.MULTILINE), new ArrayList() {{ add(new PatternEntry(5, "body doc ref")); }}); patterns.put(Pattern.compile("Title:(.{0,100}?)Contract Ref\\.:(.{0,100}?)Issue:(.{0,100}?)Rev\\.:(.{0,100}?)DocumentConsortium Ref.:(.{0,100}?)Date:", Pattern.DOTALL | Pattern.MULTILINE), new ArrayList() {{ add(new PatternEntry(5, "body doc ref")); }}); patterns.put(Pattern.compile("Document ID:(.{0,100}?)Title:(.{0,100}?)Issue:(.{0,100}?)Issue Date:", Pattern.DOTALL | Pattern.MULTILINE), new ArrayList() {{ add(new PatternEntry(1, "body doc ref")); }}); patterns.put(Pattern.compile("Contract Ref\\.:(.{0,100}?)Doc\\. Ref\\.:(.{0,100}?)Issue:(.{0,100}?)Rev\\.:(.{0,100}?)Consortium Reference:(.{0,100}?)Date:", Pattern.DOTALL | Pattern.MULTILINE), new ArrayList() {{ add(new PatternEntry(2, "body doc ref")); add(new PatternEntry(5, "body misc 1"));}}); patterns.put(Pattern.compile("Title:(.{0,100}?)Contract Ref\\.:(.{0,100}?)Issue:(.{0,100}?)Rev\\.:(.{0,100}?)Consortium(.{0,10}?)Ref\\.:(.{0,100}?)Date:", Pattern.DOTALL | Pattern.MULTILINE), new ArrayList() {{ add(new PatternEntry(6, "body doc ref")); }}); patterns.put(Pattern.compile("Title:(.{0,100}?)Date:(.{0,100}?)Doc\\. Ref\\.:(.{0,100}?)Issue:(.{0,100}?)Rev:(.{0,100}?)Consortium Ref\\.:(.{0,100}?)Contract Ref\\.:", Pattern.DOTALL | Pattern.MULTILINE), new ArrayList() {{ add(new PatternEntry(3, "body doc ref")); add(new PatternEntry(6, "body misc 1"));}}); patterns.put(Pattern.compile("Ref :(.{0,100}?)Issue :", Pattern.DOTALL | Pattern.MULTILINE), new ArrayList() {{ add(new PatternEntry(1, "body doc ref"));}}); patterns.put(Pattern.compile("Reference:(.{0,100}?)Issue:(.{0,100}?)Revision:", Pattern.DOTALL | Pattern.MULTILINE), new ArrayList() {{ add(new PatternEntry(1, "body doc ref"));}}); patterns.put(Pattern.compile("Title:(.{0,100}?)Contract Ref\\.:(.{0,100}?)Issue:(.{0,100}?)Rev\\.:(.{0,100}?)Doc\\. Ref\\.:(.{0,100}?)Date:(.{0,100}?)Consortium Ref.:(.{0,100}?)$", Pattern.DOTALL | Pattern.MULTILINE), new ArrayList() {{ add(new PatternEntry(5, "body doc ref")); add(new PatternEntry(7, "body misc 1"));}}); patterns.put(Pattern.compile("Title:(.{0,100}?)Contract Ref\\.:(.{0,100}?)Consortium Reference(.{0,100}?)Date:", Pattern.DOTALL | Pattern.MULTILINE), new ArrayList() {{ add(new PatternEntry(3, "body doc ref")); }}); } protected long counter = 0; public void process(Exchange exchange) { InformationObject io = (InformationObject) exchange.getIn().getBody(); counter++; if (io.hasTitle.startsWith("zip:")) { LOG.warn("Found entry that is a ZIP file '" + io.hasTitle + "'."); exchange.setProperty(Exchange.ROUTE_STOP, Boolean.TRUE); return; } long emptyReference = 0; /** Check whether the raw text of this io is readable. */ String deli = "\\\\"; String[] elements = io.hasUri.split(deli); String application = elements[elements.length - 2]; boolean found = false; /** See if the reference is in the document properties. */ String propertyRef = (String) io.metadata.get("DocReference"); if (propertyRef != null && propertyRef.equals("") == false) { //found = true; io.metadata.put("Reference ID (property doc ref)", propertyRef); LOG.info(counter + ". Found reference ID (in properties) '" + propertyRef + "' for document '" + application + "/" + io.hasTitle + "'."); } if (io.metadata.get("Language").equals("en") == false || (Double) io.metadata.get("Language Probability") < 0.9d) { LOG.warn("Ignoring '" + io.hasTitle + "' because it holds no text."); } else { /** First use the patterns. */ Iterator<Entry<Pattern, List<PatternEntry>>> it = patterns.entrySet().iterator(); while (it.hasNext()) { Entry<Pattern, List<PatternEntry>> entry = it.next(); Matcher matcher = entry.getKey().matcher(io.withRawText); if (matcher.find()) { for (PatternEntry patternEntry : entry.getValue()) { String id = matcher.group(patternEntry.group).trim().toUpperCase(); /** TODO: Proper fix. This is a dirty quick fix.*/ id = id.replaceAll("CONSORTIUM REFERENCE", "").trim(); found = true; if (id.equals("") || id.equalsIgnoreCase("insert reference")) { LOG.warn(counter + ". Found empty reference ID for Pattern '" + patternEntry.name + "'."); emptyReference++; } else { LOG.info(counter + ". Found reference ID '" + id + "' for document '" + application + "/" + io.hasTitle + "' using pattern '" + entry.getKey().pattern() + "'."); } io.metadata.put("Reference ID (" + patternEntry.name + ")", id); } } } /** If not found in header. */ if (found == false) { if (dontPrint.contains(application + "/" + io.hasTitle) == false) { // System.out.println(io.withRawText); // System.out.println("Document: " + application + "/" + io.hasTitle); } } /** Then do a count of hits. */ Pattern pattern = Pattern.compile("(\\p{Alpha}\\p{Alnum}{1,}-\\p{Alpha}{2,}-\\p{Alpha}{2,}-\\p{Alpha}{2,}-\\d{2,}(-\\d{2,})*(-\\p{Alpha}{2,})*)"); Matcher matcher = pattern.matcher(io.withRawText); Map<String, Integer> hits = new HashMap<String, Integer>(); while (matcher.find()) { if (hits.containsKey(matcher.group(1))) { Integer count = hits.get(matcher.group(1)) + 1; hits.put(matcher.group(1), count); } else { hits.put(matcher.group(1), 1); } } /** Find highest count and second highest count. */ Integer largest = 0; Integer secondLargest = 0; String id = ""; Iterator<Entry<String, Integer>> it2 = hits.entrySet().iterator(); while (it2.hasNext()) { Entry<String, Integer> entry = it2.next(); if (largest == 0 || largest < entry.getValue()) { secondLargest = largest; largest = entry.getValue(); id = entry.getKey(); } } id = id.toUpperCase(); if (largest - delta >= secondLargest) { found = true; io.metadata.put("Reference ID (body hits)", id); LOG.info(counter + ". Using maxCount method, found reference ID '" + id + "' for document '" + application + "/" + io.hasTitle + "'. Had " + largest + " counts, where as second best had " + secondLargest + " counts."); } if (found == false) { LOG.warn(counter + ". Failed utterly to find reference ID for document '" + application + "/" + io.hasTitle + "'."); // LOG.warn(io.withRawText); io.metadata.put("rawtext", io.withRawText); } } } }