/** * RDFaExtractorCore.java * * Created on 2006-11-12 * * Distribution and Usage * ====================== * This software may be used in terms of the CC Attribution 2.5 License * as stated at http://creativecommons.org/licenses/by/2.5/ * * Credits * ======= * This software is built on top of other software which I gratefully acknowledge: * Jena 2 (cf. http://jena.sourceforge.net/license.html for details) is used for RDF processing * and JDOM (cf. JDOM Project http://www.jdom.org/ for details) is used for XML processing. * */ package org.sw_app.sw; import java.io.File; import java.io.FileNotFoundException; import java.io.IOException; import java.io.PrintWriter; import java.io.StringWriter; import java.net.MalformedURLException; import java.net.URI; import java.net.URISyntaxException; import java.net.URL; import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Vector; import org.jdom.Attribute; import org.jdom.Element; import org.jdom.JDOMException; import org.jdom.Namespace; import org.jdom.input.SAXBuilder; import org.jdom.output.XMLOutputter; import org.jdom.xpath.XPath; import org.jdom.Document; import com.hp.hpl.jena.rdf.model.AnonId; import com.hp.hpl.jena.rdf.model.Model; import com.hp.hpl.jena.rdf.model.ModelFactory; import com.hp.hpl.jena.rdf.model.Property; import com.hp.hpl.jena.rdf.model.RDFNode; import com.hp.hpl.jena.rdf.model.RDFWriter; import com.hp.hpl.jena.rdf.model.ReifiedStatement; import com.hp.hpl.jena.rdf.model.Resource; import com.hp.hpl.jena.rdf.model.Statement; import com.hp.hpl.jena.util.FileManager; import com.hp.hpl.jena.vocabulary.RDF; /** * RDFaExtractorCore * *

This class implements the RDFa syntax according to @link http://www.w3.org/2006/07/SWD/RDFa/syntax/ . * It allows to process either a single or a list of (local or remote) XHTML document(s) that may contain(s) RDFa elements. * The output of the processing is a RDF graph that contains all RDF triples that are implicit to the input XHTML. * * @author Michael Hausenblas * @link http://creativecommons.org/licenses/by/2.5/ */ public class RDFaExtractorCore { /** * Turns XHTML tree processing display ON/OFF: * */ boolean doDumpXHTMLProcessing = false; /** * The URI of the input HTML document. */ URI inputDocURI = null; /** * The main RDF graph. */ Model rdfGraph = null; /** * Base URI for serializing. */ String baseURI = ""; /** * */ HashMap nsMap = null; /** * The 6 RDFa elements due to are: * about, property, rel, rev, href, content (datatpye only in conjunction with property) */ public static final String RDFa_about = "about"; public static final String RDFa_prop = "property"; public static final String RDFa_rel = "rel"; public static final String RDFa_rev = "rev"; public static final String RDFa_href = "href"; public static final String RDFa_content = "content"; public static final String RDFa_datatype = "datatype"; public static final String RDFa_element_meta = "meta"; public static final String RDFa_element_link = "link"; public static final String HTML_root = "html"; // OLD VERSION, looks at all RDFa elements /* public static final String RDFa_elementSelector = "//*[@" + RDFa_about + " | "+ "@" + RDFa_prop + " | "+ "@" + RDFa_rel + " | "+ "@" + RDFa_rev + " | "+ "@" + RDFa_href + " | "+ "@" + RDFa_content + "]"; */ /** * NEW version: only look at attributes that CAN trigger a triple. * Due to these are: property, rel, rev */ public static final String RDFa_elementSelector = "//*[@" + RDFa_prop + " | "+ "@" + RDFa_rel + " | "+ "@" + RDFa_rev + "]"; public static final String XMLLiteral = "XMLLiteral"; /** * Simple constructor. * */ public RDFaExtractorCore(boolean doDumpXHTMLProcessing){ this.rdfGraph = ModelFactory.createDefaultModel(); this.baseURI = ""; this.nsMap = new HashMap(); this.doDumpXHTMLProcessing = doDumpXHTMLProcessing; } /** * Reads an HTML input document (locally or remote via URL) and creates a JDOM representation out of it. * * @return */ private Document createHMTLDocument() { SAXBuilder builder = new SAXBuilder(); Document doc = null; URL inFileURL = null; String inFileName = ""; String localPrefix = "file"; System.out.println("Parsing HTML file result : "); try { inFileURL = this.getInputDocument().toURL(); inFileName = new File(inFileURL.getFile()).getAbsolutePath(); if(inFileURL.getProtocol().startsWith(localPrefix)){// workaround due to JDOM bug; use local file path doc = builder.build(inFileName); System.out.println(inFileName + " seems to be well-formed."); } else { // use URL/HTTP access doc = builder.build(inFileURL); System.out.println(inFileURL.toExternalForm() + " seems to be well-formed."); } // TODO: xml:base URI handling this.setBaseURI(inFileURL.toExternalForm()); // use the document URL as base URI } catch (MalformedURLException mue) { System.out.println(inFileName + " seems NOT to be a well-formed URL."); mue.printStackTrace(); } catch (JDOMException je) { System.out.println(inFileName + " seems NOT to be well-formed."); System.out.println(je.getMessage()); } catch (IOException ioe) { System.out.println("Could not check " + inFileName); System.out.println(" due to " + ioe.getMessage()); } return doc; } /** * * @return */ public File serializeGraph(String fileName, String serFormat){ File tmpFile = new File(fileName); RDFWriter w = this.rdfGraph.getWriter(serFormat); if(serFormat.startsWith("RDF")){ w.setProperty("showXMLDeclaration","true"); w.setProperty("tab","1"); } /* if(serFormat.startsWith("N3")|| serFormat.equals("TURTLE")){ w.setProperty("abbrevBaseURI","false"); } */ try { w.write(this.rdfGraph, new PrintWriter(tmpFile), this.getBaseURI()); } catch (FileNotFoundException e) { e.printStackTrace(); } return tmpFile; } /** * Process a list of XHTML documents. * * @param docList * @param outFormat * @param doDumpDetails */ public void processMultiple(Vector docList, String outFormat, boolean doDumpDetails){ // preformance long start = 0L; long end = 0L; int diff = 0; for (Iterator iter = docList.iterator(); iter.hasNext();) { URI docURI = (URI) iter.next(); this.setInputDocument(docURI); start = System.currentTimeMillis(); if(this.process() != null){ end = System.currentTimeMillis(); System.out.println("\n==========================================================="); System.out.println("\nUsing BASE URI: " + this.getBaseURI()); System.out.println("\nResulting RDF graph : "); System.out.println(this.getRDFGraphDump(outFormat)); } else { System.out.println("HTML input document not valid!"); } diff = (int) (end - start); System.out.println("Process time: " + diff + "ms"); } } /** * Takes a HTML input document and processes it. * Returns an RDF graph if any RDFa is present in the HTML input document. * * @param htmlInputFileURL The HTML input document URL */ public Model process() { Document doc = null; Namespace ns = null; Element root = null; List rdfaElementList = null; XPath elementsWithAttributes = null; if(this.getInputDocument() == null) return null; // no input document set else { doc = this.createHMTLDocument(); root = doc.getRootElement(); // handle global namespaces if(this.doDumpXHTMLProcessing) System.out.println("\nResult of processing XHTML top level namespaces :"); //ns = root.getNamespace(); this.addNSofElements(doc); this.rdfGraph.setNsPrefix("", this.getBaseURI()); // add the base URI of the document as the empty prefix // process XHTML tree if(this.doDumpXHTMLProcessing) System.out.println("\nResult of processing XHTML tree :"); try { // select all XHTML elements that have relevant RDFa elements inside elementsWithAttributes = XPath.newInstance(RDFaExtractorCore.RDFa_elementSelector); rdfaElementList = elementsWithAttributes.selectNodes(doc); if(rdfaElementList.isEmpty()){ // no RDFa elements found if(this.doDumpXHTMLProcessing) System.out.println("Found no RDFa in the input HTML document."); return null; } else{ // process all RDFa elements iterativly Iterator itEwA = rdfaElementList.iterator(); while(itEwA.hasNext()) { processElement((Element)itEwA.next()); } } } catch (JDOMException e) { e.printStackTrace(); } } return this.rdfGraph; } /** * Processes a single XHTML element. Evaluates its attributes and add RDF triples according to * * * @param e The current XHTML element being processed. * @param num The number of the element being processed. */ private void processElement(Element e){ Property pProp = null; Property pRel = null; Property pRev = null; Resource s = null; RDFNode o = null; String propURI = ""; String relURI = ""; String revURI = ""; String aboutURI = ""; String contentVal =""; String contentType =""; String hrefURI = ""; String lang = ""; boolean doReification = false; if(this.doDumpXHTMLProcessing) System.out.println(" Processing element: " + e.getName()); // add additional, local namespaces // this.addNSofElement(e); //DEPRECATED // === FIRST STEP: evaluate the attributes to get the URIs and values === // I. Establishing the predicate, cf. propURI = this.evaluateAttributeAtElement(e, RDFaExtractorCore.RDFa_prop); // try to extract predicate from @property relURI = this.evaluateAttributeAtElement(e, RDFaExtractorCore.RDFa_rel); // try to extract predicate from @rel revURI = this.evaluateAttributeAtElement(e, RDFaExtractorCore.RDFa_rev); // try to extract predicate from @content // II. Establishing the subject/object, cf. aboutURI = this.evaluateAttributeAtElement(e, RDFaExtractorCore.RDFa_about); // try to extract subject from @about // handle xml:base if(!aboutURI.equals("") && this.getXMLBaseFromElement(e) != null) aboutURI += this.getXMLBaseFromElement(e); if(aboutURI.equals("")) { // the element itself has NO @about // CASE and ; try to extract subject/object from context statement // due to if(this.isMetaOrLink(e)) { Element parent = e.getParentElement(); // CASE parent is a and itself; cf. if(this.isMetaOrLink(parent)) { aboutURI = this.evaluateAttributeAtElement(parent, RDFaExtractorCore.RDFa_about); // try to extract subject/object from a parent if(aboutURI.equals("")) { // parent does not have an @about doReification = true; } //if(this.doDumpXHTMLProcessing) System.out.println(" NOTE: reification due to parent or element currently not supported !"); } // CASE any other XHTML element; try to extract subject/object from context statement else{ aboutURI = this.evaluateAttributeAtElement(parent, RDFaExtractorCore.RDFa_about); // try to extract subject/object from a parent if(aboutURI.equals("")) { // parent does not have an @about aboutURI = this.getXMLIDFromElement(parent); // try to extract subject/object from parent's @xml:id if(aboutURI == null) { // parent has no xml:id aboutURI = this.createBlankNodeID(parent); // use the parent element as a bNode } } if(this.getXMLBaseFromElement(parent) != null){ // parent has a xml:base; use this instead of bNode (?) aboutURI = this.getXMLBaseFromElement(parent) ; } } } // CASE any other XHTML element; try to extract subject/object from an ancestor else{ // if current element has no @about, perform recursive to find the closest ancestor with an @about boolean foundAbout = false; Element parent = e.getParentElement(); while(!foundAbout){ if(parent.getName().equals(RDFaExtractorCore.HTML_root)){ // reached root; using the document itself as subject/object foundAbout = true; aboutURI = ""; } else { // recursive ancestor lookup aboutURI = this.evaluateAttributeAtElement(parent, RDFaExtractorCore.RDFa_about); if(!aboutURI.equals("")) { // found an @about at an ancestor foundAbout = true; } else{ parent = parent.getParentElement(); } } } } } // III. Establishing the object/subject, cf. // CASE literal object if(!propURI.equals("")){ // we have a @property set, check for a literal value // due to two cases have to be // distinguished: without or with @datatype contentType = this.evaluateAttributeAtElement(e, RDFaExtractorCore.RDFa_datatype); // try to determine @datatype contentVal = this.evaluateAttributeAtElement(e, RDFaExtractorCore.RDFa_content); // try to extract literal object value from @content lang = this.getXMLLanguageFromElement(e); // try to extract @xml:lang from element if(contentType.equals("")) { // element has NO @datatype if(contentVal.equals("")) { // element has NO @content // TODO: make XMLLiteral handling spec conformant //contentVal = this.createXMLLiteralFromElement(e); // try to extract object value from element's content as XMLLiteral contentVal = e.getTextTrim(); contentType = RDFaExtractorCore.XMLLiteral; } else { // plain literal; use contentVal as extracted above } } else{ // element has a @datatype if(contentVal.equals("")) { // element has NO @content contentVal = e.getTextNormalize(); // try to extract object value from element's content } else { // plain literal; use contentVal as extracted above } } } // CASE URI ref object/subject if(!relURI.equals("") || !revURI.equals("") ){ // we have a @rel/@rev set, check for a object/subject resource (URI ref) if(!this.isMetaOrLink(e)) { hrefURI = this.evaluateAttributeAtElement(e, RDFaExtractorCore.RDFa_href); // try to extract object/subject value from @href } else{ Element parent = e.getParentElement(); // CASE parent is a and itself; cf. if(this.isMetaOrLink(parent)) { hrefURI = this.evaluateAttributeAtElement(parent, RDFaExtractorCore.RDFa_href); // try to extract object/subject value from @href of parent } } } // === SECOND STEP: update the RDF graph according to the found S, P, O === if(this.doDumpXHTMLProcessing) System.out.print(" Adding TRIPLE: "); // CASE @property if(!propURI.equals("")) { s = this.createResourceFromURIRef(aboutURI); // create the subject pProp = this.rdfGraph.createProperty(propURI); // create the predicate // create the object using a literal if(lang == null){ // create simple literal without @xml:lang o = this.rdfGraph.createLiteral(contentVal); } else{ // create a literal with @xml:lang o = this.rdfGraph.createLiteral(contentVal, lang); } if(!contentType.equals("")){ // a typed literal object if(contentType.equals(RDFaExtractorCore.XMLLiteral)) { // we have an XMLLiteral o = this.rdfGraph.createLiteral(contentVal, true); } else{ // a type was specified; use it! if(contentType.equals("plaintext")) { // @datatype is plaintext o = this.rdfGraph.createLiteral(contentVal); // create simple literal without datatype } else { // @datatype is an XSD (?) o = this.rdfGraph.createTypedLiteral(contentVal, contentType); // // create typed literal } } } if(doReification){// a link/meta element inside a link/meta element this.addReifiedTriple(s, pProp, o); } else { this.addTriple(s, pProp, o); } } // CASE @rel if(!relURI.equals("")){ s = this.createResourceFromURIRef(aboutURI); // create the subject using a resource (URI ref) pRel = this.rdfGraph.createProperty(relURI); // create the predicate o = this.createResourceFromURIRef(hrefURI); // create the object using a resource (URI ref) if(doReification){// a link/meta element inside a link/meta element this.addReifiedTriple(s, pRel, o); } else{ this.addTriple(s, pRel, o); } } // CASE @rev if(!revURI.equals("")) { s = this.createResourceFromURIRef(hrefURI); // create the subject using a resource (URI ref) pRev = this.rdfGraph.createProperty(revURI); o = this.createResourceFromURIRef(aboutURI); // create the object using a resource (URI ref) if(doReification){// a link/meta element inside a link/meta element this.addReifiedTriple(s, pRev, o); } else{ this.addTriple(s, pRev, o); } } } private Resource createResourceFromURIRef(String aURIREf){ if(aURIREf.startsWith("_")){ // URI ref is a bNode AnonId anonID = new AnonId(aURIREf.substring(1)); // create anonymous ID return this.rdfGraph.createResource(anonID); } else { // URI ref is a URI indeed return this.rdfGraph.createResource(aURIREf); } } private String createXMLLiteralFromElement(Element e) { StringBuffer tmp = new StringBuffer(); //List contentList = e.getContent(); XMLOutputter xmlOutputter = new XMLOutputter(); List children = e.getChildren(); Iterator iterator = children.iterator(); while (iterator.hasNext()) { Element child = (Element) iterator.next(); tmp.append(xmlOutputter.outputString(child)); } return tmp.toString(); } private String createBlankNodeID(Element parent) { return "_" + parent.getName() + System.currentTimeMillis(); //creates random bNode ID } private void addTriple(Resource s, Property p, RDFNode o){ Statement triple = this.rdfGraph.createStatement(s, p, o); this.rdfGraph.add(triple); if(this.doDumpXHTMLProcessing) System.out.println(" Added triple: " + triple.toString()); } private void addReifiedTriple(Resource s, Property p, RDFNode o){ Statement triple = this.rdfGraph.createStatement(s, p, o); ReifiedStatement rTriple = this.rdfGraph.createReifiedStatement(triple); this.rdfGraph.add(rTriple.getStatement()); if(this.doDumpXHTMLProcessing) System.out.println(" Added triple: " + triple.toString()); } private String getXMLIDFromElement(Element e){ String tmpID = e.getAttributeValue("id", Namespace.XML_NAMESPACE); // try to extract the @xml:id of element (with namespace) if(tmpID == null) tmpID = e.getAttributeValue("id"); // try to extract the @xml:id of element (without namespace) return tmpID; } private String getXMLLanguageFromElement(Element e){ String tmpLang = e.getAttributeValue("lang", Namespace.XML_NAMESPACE); // try to extract the @xml:lang of element (with namespace) if(tmpLang == null ) tmpLang = e.getAttributeValue("lang"); // try to extract the @xml:lang of element (without namespace) return tmpLang; } private String getXMLBaseFromElement(Element e){ return e.getAttributeValue("base", Namespace.XML_NAMESPACE); // try to extract the xml:base of element (with namespace) } private String evaluateAttributeAtElement(Element e, String aName) { Attribute tmpAttribute = e.getAttribute(aName); String tmpURI = ""; if(tmpAttribute != null) { // element has an attribute with the specified name if(this.isCURIE(tmpAttribute)) { // the attribute contains a CURIE tmpURI = this.normaliseCURIE(tmpAttribute); } else{ // the attribute contains a URI tmpURI = this.normaliseQName(tmpAttribute); } if(this.doDumpXHTMLProcessing) System.out.println(" @" + aName + "=" + tmpURI); } return tmpURI; } private boolean isMetaOrLink(Element e){ if(e.getName().equals(RDFaExtractorCore.RDFa_element_meta) || e.getName().equals(RDFaExtractorCore.RDFa_element_link)) return true; else return false; } private boolean isCURIE(Attribute a){ if(a.getValue().startsWith("[")) return true; else return false; } private void addNSofElements(Document doc){ XPath elements = null; try { elements = XPath.newInstance("//*"); List elementsList = elements.selectNodes(doc); if(!elementsList.isEmpty()){ // no RDFa elements found Iterator itE = elementsList.iterator(); while(itE.hasNext()) { this.addNSofSingleElement((Element)itE.next()); } } } catch (JDOMException e) { e.printStackTrace(); } } /** * Generates NS to URI mappings on an element (and put it into the global map). * * @param e */ private void addNSofSingleElement(Element e) { List nsList = e.getAdditionalNamespaces(); Iterator itNS = nsList.iterator(); if(!nsList.isEmpty()){ while(itNS.hasNext()) { Namespace current = (Namespace)itNS.next(); String nsURI = current.getURI(); String nsPrefix = current.getPrefix(); this.nsMap.put(nsPrefix, nsURI); // add to global NS map this.rdfGraph.setNsPrefix(nsPrefix, nsURI); // add to RDF graph if(this.doDumpXHTMLProcessing) System.out.println(" added NS: " + nsPrefix + " = "+ nsURI); } } } /** * Convert CURIEs to normal form. * * @param currentA * @return */ private String normaliseCURIE(Attribute currentA) { String aVal = currentA.getValue(); String nURI = ""; StringBuffer tmp = new StringBuffer(); String tmpNS = ""; if(aVal.startsWith("[")){ // this is a CURIE: [NS:LOCALPART] tmpNS = aVal.substring(1, aVal.indexOf(":")); if(tmpNS.startsWith("_")){ // bNode tmp.append(tmpNS); // add bNode reference } else{ // try to look up namespace tmp.append(this.nsMap.get(tmpNS)); } tmp.append(aVal.substring(aVal.indexOf(":") + 1, aVal.indexOf("]"))); // append local part nURI = tmp.toString(); } else { // no CURIE if (aVal.startsWith("#")){ // a fragment - append base URI nURI = this.getBaseURI() + aVal; } else nURI = aVal; // a URI already } return nURI; } private String normaliseQName(Attribute currentA){ String aVal = currentA.getValue(); StringBuffer tmp = new StringBuffer(); if(!aVal.startsWith("_")){ // a URI if(aVal.indexOf("/") == 0){ // a QName to expand tmp.append(this.nsMap.get(aVal.substring(0, aVal.indexOf(":")))); tmp.append(aVal.substring(aVal.indexOf(":")+1)); } else{ // a URI already tmp.append(aVal); } } else { // a bNode tmp.append(aVal.substring(2)); } return tmp.toString(); } private void setBaseURI(String aBaseURI) { this.baseURI = aBaseURI; } public String getBaseURI() { return this.baseURI; } public void setInputDocument(URI inputDocumentURI) { this.inputDocURI = inputDocumentURI; if(!this.rdfGraph.isEmpty()) this.rdfGraph.removeAll(); if(!this.nsMap.isEmpty()) this.nsMap.clear(); } public URI getInputDocument() { return this.inputDocURI; } public Model getRDFGraph(){ if (this.rdfGraph.isEmpty()) return null; else return this.rdfGraph; } public String getRDFGraphDump(String outFormat){ StringWriter sw = new StringWriter(); rdfGraph.write(sw, outFormat); return sw.toString(); } /** * @param args */ public static void main(String[] args) { RDFaExtractorCore rdfx = new RDFaExtractorCore(false); // web HTML input test files String webBaseDir = "http://dev.torrez.us/public/2006/rdfa/tests/"; //String webInputFileName = "1.htm"; // mixed testcase; not so sure if correct (@role)? //String webInputFileName = "2.htm"; // property, about, element-content :: PASSED //String webInputFileName = "3.htm"; // container, property, about, element-content :: PASSED //String webInputFileName = "4.htm"; // bNode container, property, about, element-content :: PASSED //String webInputFileName = "5.htm"; // xml:base container, property + rel :: PASSED //String webInputFileName = "6.htm"; // CURIE with namespace :: PASSED //String webInputFileName = "7.htm"; // rel, and rev :: PASSED //String webInputFileName = "8.htm"; // rel, rev, and property + about, and content :: PASSED //String webInputFileName = "9.htm"; // xml:base container, rdf:type, href, etc. :: PASSED //String webInputFileName = "10.htm"; // CURIE, empty about :: PASSED //String webInputFileName = "11.htm"; // rel, and about with mailto: (no namespace) :: PASSED //String webInputFileName = "12.htm"; // rel+rev, and about with mailto: (no namespace) :: PASSED //String webInputFileName = "13.htm"; // XMLLiteral :: children content handling OPEN ISSUE //String webInputFileName = "14.htm"; // xml:lang without @datatype directly at element :: PASSED //String webInputFileName = "15.htm"; // xml:lang at ancestor :: OPEN ISSUE //String webInputFileName = "16.htm"; // datatype=xsd:int :: PASSED //String webInputFileName = "17.htm"; // link, meta, bNode (with rel and property) :: PASSED //String webInputFileName = "18.htm"; // datatype=plaintext :: PASSED //String webInputFileName = "19.htm"; // bNode references (output same as 17) :: PASSED //String webInputFileName = "20.htm"; // bNode references S/P mutual :: PASSED //String webInputFileName = "21.htm"; // rel with global about :: PASSED //String webInputFileName = "22.htm"; // mixed with CURIE, URI refs, etc. :: OPEN ISSUE //String webInputFileName = "23.htm"; // meta in link (reification) :: OPEN ISSUE // local HTML input test files String baseDir = "file://C:/project/MMSEM-XG/sandbox/RDFaXtractor/workspace/RDFaX/testcases/"; //String inputFileName = "benadida.html"; // (local) RDF output file //String rdfOutputFileName = "C:/project/MMSEM-XG/sandbox/RDFaXtractor/RDFa/testcases/out/" + webInputFileName + ".rdf"; //String serFormat = "RDF/XML"; // the list of testcases (input document URIs) Vector inputDocURIList = new Vector(); int startDocNum = 23; int endDocNum = 23; try { for (int i = startDocNum; i < endDocNum +1 ; i++) { inputDocURIList.add(new URI(baseDir + i + ".htm")); } rdfx.processMultiple(inputDocURIList, "N-TRIPLE", true); } catch (URISyntaxException e) { e.printStackTrace(); } } }