// Import packages dealing with XML parsing and representation import org.xml.sax.*; import org.w3c.dom.*; import org.apache.xerces.dom.*; import org.apache.xerces.parsers.*; import org.apache.xml.serialize.*; // Import packages dealing with XSL transformation import org.apache.xalan.xslt.*; import org.apache.xalan.xpath.*; // Import the Tidy package for HTML to XML transformation import org.w3c.tidy.*; // Import a few standard Java packages import java.io.*; import java.util.*; import java.net.*; /** * XMLHelper is a class designed to provide some generic utility functions * for working with HTML, XHTML, XML, and XSL. All methods contained herein * are static, so no instantiation of this class is ever necessary. The methods * deal with parsing, input/output, retrieving files from the network, and * transformation and clean-up of documents. * * @author Jared Jackson, Email: jjared@almaden.ibm.com * @see XMLHelperException */ public class XMLHelper { /** * This method creates a default XML document. The document is empty except * for a single root element, with tag name as specified by the parameter. * * @param rootName The name of the root element of the XML document. If null or empty, no root element is added to the document. * @return An empty XML document, save possibly a single root node. */ public static Document createXml(String rootName) { Document doc = new DocumentImpl(); if (rootName == null || rootName.trim().equals("")) return doc; doc.appendChild(doc.createElement(rootName)); return doc; } /** * Given an URL as a String, this method retrieves * the file located at that URL, and attempts to parse it as XML. * * @param url A URL encoding such as "http://www.ibm.com/someXML.xml" of the target document * @return A parsed XML document found at the given URL * @exception XMLHelperException Thrown if the URL is malformed, the file * at the given URL can not be obtained, or the file found is not valid XML. */ public static Document parseXMLFromURLString(String url) throws XMLHelperException { return parseXMLFromURL(convertStringToURL(url)); } /** * Given an URL, this method retrieves * the file located at that URL, and attempts to parse it as XML. * * @param url A URL java class instantiation of the target document * @return A parsed XML document found at the given URL * @exception XMLHelperException Thrown if the URL is malformed, the file * at the given URL can not be obtained, or the file found is not valid XML. */ public static Document parseXMLFromURL(URL url) throws XMLHelperException { try { URLConnection inConnection = url.openConnection(); InputSource is = new InputSource(inConnection.getInputStream()); return parseXMLFromInputSource(is); } catch (IOException ioe) { throw new XMLHelperException("Unable to read from source string", ioe); } } /** * Given an XML document currently unparsed in the form of a String, * this method attempts to parse the content of that String as XML. * * @param source A String encoding of a XML document. * @return A parsed XML document * @exception XMLHelperException Thrown if the string given is not valid XML. */ public static Document parseXMLFromString(String source) throws XMLHelperException { InputSource is = new InputSource(new StringReader(source)); return parseXMLFromInputSource(is); } /** * Given an XML document pointed to by a File object, this method * attemps to read the file and parse it as XML. * * @param sourceFile A File object referencing an XML file. * @return A parsed XML document * @exception XMLHelperException Thrown if the file is unreadable or the file does not contain a valid XML document */ public static Document parseXMLFromFile(File sourceFile) throws XMLHelperException { InputSource is = null; try { is = new InputSource(new FileInputStream(sourceFile)); } catch (IOException ioe) { throw new XMLHelperException("The XML file could not be retrieved", ioe); } return parseXMLFromInputSource(is); } /** * Given an XML document pointed to by a file path expression, this method * attemps to read the file and parse it as XML. * * @param sourceFile An absolute or relative file path expression. * @return A parsed XML document * @exception XMLHelperException Thrown if the file is unreadable or the file does not contain a valid XML document */ public static Document parseXMLFromFile(String sourceFile) throws XMLHelperException { InputSource is = null; try { is = new InputSource(new FileInputStream(sourceFile)); } catch (IOException ioe) { throw new XMLHelperException("The XML file could not be retrieved", ioe); } return parseXMLFromInputSource(is); } // This is the real work horse around XML parsing, the public methods each attempt to // create InputSource objects, then call this method for parsing private static Document parseXMLFromInputSource(InputSource is) throws XMLHelperException { Document doc = null; try { DOMParser parser = new DOMParser(); parser.parse(is); doc = parser.getDocument(); } catch (IOException ioe) { throw new XMLHelperException("Unable to read from source string", ioe); } catch (SAXException saxe) { throw new XMLHelperException("Unable to parse the given string", saxe); } return doc; } /** * Given two XML documents, one the target XML file and one an XSL file, this method * applies an XSL transform defined by the XSL file on the XML file and returns the * resulting document. * * @param xmlDoc The source XML file * @param xslDoc An XML file that also follows the XSL transformation language specification * @return The document resulting from applying xslDoc to xmlDoc. * @exception XMLHelperException Thrown if the XSL document is either poorly formed as XSL or if it encounters an error during transformation. */ public static Document transformXML(Document xmlDoc, Document xslDoc) throws XMLHelperException { try { XSLTInputSource xmlIn = new XSLTInputSource(xmlDoc); XSLTInputSource xslIn = new XSLTInputSource(xslDoc); ByteArrayOutputStream baos = new ByteArrayOutputStream(); XSLTResultTarget xmlOut = new XSLTResultTarget(baos); XSLTProcessor processor = XSLTProcessorFactory.getProcessor(); processor.process(xmlIn, xslIn, xmlOut); baos.close(); String result = baos.toString(); return parseXMLFromString(result); } catch (SAXException saxe) { throw new XMLHelperException("Unable to perform transform", saxe); } catch (IOException ioe) { throw new XMLHelperException("Unable to perform transform", ioe); } } /** * Given an XML document, a pretty (tab delimited and with line breaks) representation is * sent to the specified PrintStream object. This is the most convenient way to * output an XML document to standard out. * * @param doc The XML document to output * @param stream The stream to send the result to. (e.g. System.out or System.err) * @exception XMLHelperException Thrown in the event of an I/O error. */ public static void outputXML(Document doc, PrintStream stream) throws XMLHelperException { try { OutputFormat of = new OutputFormat(doc); of.setIndenting(true); XMLSerializer serializer = new XMLSerializer(stream, of); serializer.serialize(doc); } catch (IOException ioe) { throw new XMLHelperException("Unable to write to the given print stream", ioe); } } /** * Given an XML document and a relative or absolute path name for a file, writes * the XML document to that file location. The format of the written XML document * will be tab delimited and line breaked. The file name will need to use the system * dependent separator character(s) for directory navigation. * * @param doc The XML document to output. * @param fileName A file name either relative to the running Java virtual machine, or absolute. * @exception XMLHelperException Thrown if an I/O error occurs. */ public static void outputXMLToFile(Document doc, String fileName) throws XMLHelperException { try { OutputFormat of = new OutputFormat(doc); of.setIndenting(true); File f = new File(fileName); FileOutputStream fos = new FileOutputStream(f); XMLSerializer serializer = new XMLSerializer(fos, of); serializer.serialize(doc); fos.close(); } catch (IOException ioe) { throw new XMLHelperException("Unable to write to the given file", ioe); } } /** * A utility method for converting an XML document to a String object. * This method is included in case the user would like to do their own I/O in a way * not specified in this class. * * @param doc The XML document to be encoded as a String. * @return The XML document as text in a String. */ public static String convertXMLToString(Document doc) throws XMLHelperException { try { OutputFormat of = new OutputFormat(doc); of.setIndenting(true); StringWriter sw = new StringWriter(); XMLSerializer serializer = new XMLSerializer(sw, of); serializer.serialize(doc); return sw.toString(); } catch (IOException ioe) { throw new XMLHelperException("Unable to write to the string", ioe); } } /** *

Copies the content of one XML Element to another. By setting the * childrenOnly parameter to false, the element is simply * imported as a child to the original element. If the parameter is instead * true, then the children of the element are imported as children * to the original element.

* *

For instance, let the mergeToXML parameter be the XML:

*
*
	 * <Original>
	 * </Original>
	 * 
*
*

And let the mergeFromXML parameter be the XML:

*
*
	 * <Target>
	 *   <Child>Child 1</Child>
	 *   <Child>Child 2</Child>
	 * </Target>
	 * 
*
*

If childrenOnly is set to false, the result is:

*
*
	 * <Original>
	 *   <Target>
	 *     <Child>Child 1</Child>
	 *     <Child>Child 2</Child>
	 *   </Target>
	 * </Original>
	 * 
*
*

Otherwise, if childrenOnly is set to true, the result is:

*
*
	 * <Original>
	 *   <Child>Child 1</Child>
	 *   <Child>Child 2</Child>
	 * </Original>
	 * 
*
* * @param mergeToXML The element into which the XML will be inserted as children. * @param mergeFromXML The element from which the XML will be copied and imported. * @param childrenOnly If true grab the children and ignore the parent. If false grab everything. */ public static void mergeXML(Element mergeToXML, Element mergeFromXML, boolean childrenOnly) { Document toDoc = mergeToXML.getOwnerDocument(); Element copyElem = (Element)(toDoc.importNode(mergeFromXML,true)); if (childrenOnly) { NodeList nlist = copyElem.getChildNodes(); for (int i=0; i < nlist.getLength(); i++) { org.w3c.dom.Node n = nlist.item(i); mergeToXML.appendChild(n); } return; } else { mergeToXML.appendChild(copyElem); } } /** * Retrieves an HTML page from a URL encoded as a String and * attempts to clean up the source of that HTML to remove author errors. If * successful, the resulting document is converted to XHTML and returned as * an XML document. * * @param url A String encoding of a URL (e.g. "http://www.ibm.com/index.html"). * @return an XML document representing the XHTML of the source of the HTML file. * @exception XMLHelperException Thrown if the URL is malformed, the HTML source can not be obtained, or the tool is unable to convert the source to XML. */ public static Document tidyHTML(String url) throws XMLHelperException { return tidyHTML(convertStringToURL(url)); } /** * Retrieves an HTML page from a java URL object and * attempts to clean up the source of that HTML to remove author errors. If * successful, the resulting document is converted to XHTML and returned as * an XML document. * * @param url A URL object hopefully pointing to an HTML file. * @return an XML document representing the XHTML of the source of the HTML file. * @exception XMLHelperException Thrown if the HTML source can not be obtained or the tool is unable to convert the source to XML. */ public static Document tidyHTML(URL url) throws XMLHelperException { try { URLConnection inConnection = url.openConnection(); if (inConnection.getContentType().startsWith("text/xml") || inConnection.getContentType().startsWith("text/xhtml")) { // All ready an XML source return parseXMLFromURL(url); } else if (inConnection.getContentType().startsWith("text/html")) { // An HTML source InputStream is = inConnection.getInputStream(); // Clean the input stream ByteArrayOutputStream out = new ByteArrayOutputStream(); int totalBytes = 0; byte[] buffer = new byte[16384]; while (true) { int bytesRead = is.read(buffer, 0, buffer.length); if (bytesRead < 0) break; // Remove binary below space except tab and newline for (int i=0; i < bytesRead; i++) { byte b = buffer[i]; if (b < 32 && b!= 10 && b != 13 && b != 9) b = 32; buffer[i] = b; } out.write(buffer, 0, bytesRead); totalBytes += bytesRead; } is.close(); out.close(); String outContent = out.toString(); InputStream in = new ByteArrayInputStream(out.toByteArray()); org.w3c.tidy.TagTable tags = org.w3c.tidy.TagTable.getDefaultTagTable(); tags.defineBlockTag("script"); Tidy tidy = new Tidy(); tidy.setShowWarnings(false); tidy.setXmlOut(true); tidy.setXmlPi(false); tidy.setDocType("omit"); tidy.setXHTML(false); tidy.setRawOut(true); tidy.setNumEntities(true); tidy.setQuiet(true); tidy.setFixComments(true); tidy.setIndentContent(true); tidy.setCharEncoding(org.w3c.tidy.Configuration.ASCII); ByteArrayOutputStream baos = new ByteArrayOutputStream(); org.w3c.tidy.Node tNode = tidy.parse(in, baos); String result = "\n" + baos.toString(); // Strip the DOCTYPE and script elements - This is an optional step int startIndex = 0; int endIndex = 0; if ((startIndex = result.indexOf("= 0) { endIndex = result.indexOf(">",startIndex); result = result.substring(0,startIndex) + result.substring(endIndex + 1, result.length()); } while ((startIndex = result.indexOf("= 0) { endIndex = result.indexOf(""); result = result.substring(0,startIndex) + result.substring(endIndex + 9, result.length()); } in.close(); baos.close(); return parseXMLFromString(result); } else { throw new XMLHelperException("Unable to tidy content type: " + inConnection.getContentType()); } } catch (IOException ioe) { throw new XMLHelperException("Unable to perform input/output", ioe); } } // A utility method for converting a String encoding of a URL to a URL private static URL convertStringToURL(String url) throws XMLHelperException { try { return new URL(url); } catch (MalformedURLException murle) { throw new XMLHelperException(url + " is not a well formed URL", murle); } } }