// Import packages dealing with XML parsing and representation
import org.xml.sax.*;
import org.w3c.dom.*;
import org.apache.xerces.dom.*;
import org.apache.xerces.parsers.*;
import org.apache.xml.serialize.*;
// Import packages dealing with XSL transformation
import org.apache.xalan.xslt.*;
import org.apache.xalan.xpath.*;
// Import the Tidy package for HTML to XML transformation
import org.w3c.tidy.*;
// Import a few standard Java packages
import java.io.*;
import java.util.*;
import java.net.*;
/**
* XMLHelper is a class designed to provide some generic utility functions
* for working with HTML, XHTML, XML, and XSL. All methods contained herein
* are static, so no instantiation of this class is ever necessary. The methods
* deal with parsing, input/output, retrieving files from the network, and
* transformation and clean-up of documents.
*
* @author Jared Jackson, Email: jjared@almaden.ibm.com
* @see XMLHelperException
*/
public class XMLHelper {
/**
* This method creates a default XML document. The document is empty except
* for a single root element, with tag name as specified by the parameter.
*
* @param rootName The name of the root element of the XML document. If null or empty, no root element is added to the document.
* @return An empty XML document, save possibly a single root node.
*/
public static Document createXml(String rootName) {
Document doc = new DocumentImpl();
if (rootName == null || rootName.trim().equals("")) return doc;
doc.appendChild(doc.createElement(rootName));
return doc;
}
/**
* Given an URL as a String, this method retrieves
* the file located at that URL, and attempts to parse it as XML.
*
* @param url A URL encoding such as "http://www.ibm.com/someXML.xml" of the target document
* @return A parsed XML document found at the given URL
* @exception XMLHelperException Thrown if the URL is malformed, the file
* at the given URL can not be obtained, or the file found is not valid XML.
*/
public static Document parseXMLFromURLString(String url) throws XMLHelperException {
return parseXMLFromURL(convertStringToURL(url));
}
/**
* Given an URL, this method retrieves
* the file located at that URL, and attempts to parse it as XML.
*
* @param url A URL java class instantiation of the target document
* @return A parsed XML document found at the given URL
* @exception XMLHelperException Thrown if the URL is malformed, the file
* at the given URL can not be obtained, or the file found is not valid XML.
*/
public static Document parseXMLFromURL(URL url) throws XMLHelperException {
try {
URLConnection inConnection = url.openConnection();
InputSource is = new InputSource(inConnection.getInputStream());
return parseXMLFromInputSource(is);
} catch (IOException ioe) {
throw new XMLHelperException("Unable to read from source string", ioe);
}
}
/**
* Given an XML document currently unparsed in the form of a String,
* this method attempts to parse the content of that String as XML.
*
* @param source A String encoding of a XML document.
* @return A parsed XML document
* @exception XMLHelperException Thrown if the string given is not valid XML.
*/
public static Document parseXMLFromString(String source) throws XMLHelperException {
InputSource is = new InputSource(new StringReader(source));
return parseXMLFromInputSource(is);
}
/**
* Given an XML document pointed to by a File object, this method
* attemps to read the file and parse it as XML.
*
* @param sourceFile A File object referencing an XML file.
* @return A parsed XML document
* @exception XMLHelperException Thrown if the file is unreadable or the file does not contain a valid XML document
*/
public static Document parseXMLFromFile(File sourceFile) throws XMLHelperException {
InputSource is = null;
try {
is = new InputSource(new FileInputStream(sourceFile));
} catch (IOException ioe) {
throw new XMLHelperException("The XML file could not be retrieved", ioe);
}
return parseXMLFromInputSource(is);
}
/**
* Given an XML document pointed to by a file path expression, this method
* attemps to read the file and parse it as XML.
*
* @param sourceFile An absolute or relative file path expression.
* @return A parsed XML document
* @exception XMLHelperException Thrown if the file is unreadable or the file does not contain a valid XML document
*/
public static Document parseXMLFromFile(String sourceFile) throws XMLHelperException {
InputSource is = null;
try {
is = new InputSource(new FileInputStream(sourceFile));
} catch (IOException ioe) {
throw new XMLHelperException("The XML file could not be retrieved", ioe);
}
return parseXMLFromInputSource(is);
}
// This is the real work horse around XML parsing, the public methods each attempt to
// create InputSource objects, then call this method for parsing
private static Document parseXMLFromInputSource(InputSource is) throws XMLHelperException {
Document doc = null;
try {
DOMParser parser = new DOMParser();
parser.parse(is);
doc = parser.getDocument();
} catch (IOException ioe) {
throw new XMLHelperException("Unable to read from source string", ioe);
} catch (SAXException saxe) {
throw new XMLHelperException("Unable to parse the given string", saxe);
}
return doc;
}
/**
* Given two XML documents, one the target XML file and one an XSL file, this method
* applies an XSL transform defined by the XSL file on the XML file and returns the
* resulting document.
*
* @param xmlDoc The source XML file
* @param xslDoc An XML file that also follows the XSL transformation language specification
* @return The document resulting from applying xslDoc to xmlDoc.
* @exception XMLHelperException Thrown if the XSL document is either poorly formed as XSL or if it encounters an error during transformation.
*/
public static Document transformXML(Document xmlDoc, Document xslDoc) throws XMLHelperException {
try {
XSLTInputSource xmlIn = new XSLTInputSource(xmlDoc);
XSLTInputSource xslIn = new XSLTInputSource(xslDoc);
ByteArrayOutputStream baos = new ByteArrayOutputStream();
XSLTResultTarget xmlOut = new XSLTResultTarget(baos);
XSLTProcessor processor = XSLTProcessorFactory.getProcessor();
processor.process(xmlIn, xslIn, xmlOut);
baos.close();
String result = baos.toString();
return parseXMLFromString(result);
} catch (SAXException saxe) {
throw new XMLHelperException("Unable to perform transform", saxe);
} catch (IOException ioe) {
throw new XMLHelperException("Unable to perform transform", ioe);
}
}
/**
* Given an XML document, a pretty (tab delimited and with line breaks) representation is
* sent to the specified PrintStream object. This is the most convenient way to
* output an XML document to standard out.
*
* @param doc The XML document to output
* @param stream The stream to send the result to. (e.g. System.out or System.err)
* @exception XMLHelperException Thrown in the event of an I/O error.
*/
public static void outputXML(Document doc, PrintStream stream) throws XMLHelperException {
try {
OutputFormat of = new OutputFormat(doc);
of.setIndenting(true);
XMLSerializer serializer = new XMLSerializer(stream, of);
serializer.serialize(doc);
} catch (IOException ioe) {
throw new XMLHelperException("Unable to write to the given print stream", ioe);
}
}
/**
* Given an XML document and a relative or absolute path name for a file, writes
* the XML document to that file location. The format of the written XML document
* will be tab delimited and line breaked. The file name will need to use the system
* dependent separator character(s) for directory navigation.
*
* @param doc The XML document to output.
* @param fileName A file name either relative to the running Java virtual machine, or absolute.
* @exception XMLHelperException Thrown if an I/O error occurs.
*/
public static void outputXMLToFile(Document doc, String fileName) throws XMLHelperException {
try {
OutputFormat of = new OutputFormat(doc);
of.setIndenting(true);
File f = new File(fileName);
FileOutputStream fos = new FileOutputStream(f);
XMLSerializer serializer = new XMLSerializer(fos, of);
serializer.serialize(doc);
fos.close();
} catch (IOException ioe) {
throw new XMLHelperException("Unable to write to the given file", ioe);
}
}
/**
* A utility method for converting an XML document to a String object.
* This method is included in case the user would like to do their own I/O in a way
* not specified in this class.
*
* @param doc The XML document to be encoded as a String.
* @return The XML document as text in a String.
*/
public static String convertXMLToString(Document doc) throws XMLHelperException {
try {
OutputFormat of = new OutputFormat(doc);
of.setIndenting(true);
StringWriter sw = new StringWriter();
XMLSerializer serializer = new XMLSerializer(sw, of);
serializer.serialize(doc);
return sw.toString();
} catch (IOException ioe) {
throw new XMLHelperException("Unable to write to the string", ioe);
}
}
/**
*
Copies the content of one XML Element to another. By setting the
* childrenOnly parameter to false, the element is simply
* imported as a child to the original element. If the parameter is instead
* true, then the children of the element are imported as children
* to the original element.
For instance, let the mergeToXML parameter be the XML:
*** <Original> * </Original> **
And let the mergeFromXML parameter be the XML:
*** <Target> * <Child>Child 1</Child> * <Child>Child 2</Child> * </Target> **
If childrenOnly is set to false, the result is:
*** <Original> * <Target> * <Child>Child 1</Child> * <Child>Child 2</Child> * </Target> * </Original> **
Otherwise, if childrenOnly is set to true, the result is:
** * @param mergeToXML The element into which the XML will be inserted as children. * @param mergeFromXML The element from which the XML will be copied and imported. * @param childrenOnly If* <Original> * <Child>Child 1</Child> * <Child>Child 2</Child> * </Original> **
true grab the children and ignore the parent. If false grab everything.
*/
public static void mergeXML(Element mergeToXML, Element mergeFromXML, boolean childrenOnly) {
Document toDoc = mergeToXML.getOwnerDocument();
Element copyElem = (Element)(toDoc.importNode(mergeFromXML,true));
if (childrenOnly) {
NodeList nlist = copyElem.getChildNodes();
for (int i=0; i < nlist.getLength(); i++) {
org.w3c.dom.Node n = nlist.item(i);
mergeToXML.appendChild(n);
}
return;
} else {
mergeToXML.appendChild(copyElem);
}
}
/**
* Retrieves an HTML page from a URL encoded as a String and
* attempts to clean up the source of that HTML to remove author errors. If
* successful, the resulting document is converted to XHTML and returned as
* an XML document.
*
* @param url A String encoding of a URL (e.g. "http://www.ibm.com/index.html").
* @return an XML document representing the XHTML of the source of the HTML file.
* @exception XMLHelperException Thrown if the URL is malformed, the HTML source can not be obtained, or the tool is unable to convert the source to XML.
*/
public static Document tidyHTML(String url) throws XMLHelperException {
return tidyHTML(convertStringToURL(url));
}
/**
* Retrieves an HTML page from a java URL object and
* attempts to clean up the source of that HTML to remove author errors. If
* successful, the resulting document is converted to XHTML and returned as
* an XML document.
*
* @param url A URL object hopefully pointing to an HTML file.
* @return an XML document representing the XHTML of the source of the HTML file.
* @exception XMLHelperException Thrown if the HTML source can not be obtained or the tool is unable to convert the source to XML.
*/
public static Document tidyHTML(URL url) throws XMLHelperException {
try {
URLConnection inConnection = url.openConnection();
if (inConnection.getContentType().startsWith("text/xml") ||
inConnection.getContentType().startsWith("text/xhtml")) {
// All ready an XML source
return parseXMLFromURL(url);
} else if (inConnection.getContentType().startsWith("text/html")) {
// An HTML source
InputStream is = inConnection.getInputStream();
// Clean the input stream
ByteArrayOutputStream out = new ByteArrayOutputStream();
int totalBytes = 0;
byte[] buffer = new byte[16384];
while (true) {
int bytesRead = is.read(buffer, 0, buffer.length);
if (bytesRead < 0) break;
// Remove binary below space except tab and newline
for (int i=0; i < bytesRead; i++) {
byte b = buffer[i];
if (b < 32 && b!= 10 && b != 13 && b != 9) b = 32;
buffer[i] = b;
}
out.write(buffer, 0, bytesRead);
totalBytes += bytesRead;
}
is.close();
out.close();
String outContent = out.toString();
InputStream in = new ByteArrayInputStream(out.toByteArray());
org.w3c.tidy.TagTable tags = org.w3c.tidy.TagTable.getDefaultTagTable();
tags.defineBlockTag("script");
Tidy tidy = new Tidy();
tidy.setShowWarnings(false);
tidy.setXmlOut(true);
tidy.setXmlPi(false);
tidy.setDocType("omit");
tidy.setXHTML(false);
tidy.setRawOut(true);
tidy.setNumEntities(true);
tidy.setQuiet(true);
tidy.setFixComments(true);
tidy.setIndentContent(true);
tidy.setCharEncoding(org.w3c.tidy.Configuration.ASCII);
ByteArrayOutputStream baos = new ByteArrayOutputStream();
org.w3c.tidy.Node tNode = tidy.parse(in, baos);
String result = "\n" +
baos.toString();
// Strip the DOCTYPE and script elements - This is an optional step
int startIndex = 0;
int endIndex = 0;
if ((startIndex = result.indexOf("= 0) {
endIndex = result.indexOf(">",startIndex);
result = result.substring(0,startIndex) +
result.substring(endIndex + 1, result.length());
}
while ((startIndex = result.indexOf("");
result = result.substring(0,startIndex) +
result.substring(endIndex + 9, result.length());
}
in.close();
baos.close();
return parseXMLFromString(result);
} else {
throw new XMLHelperException("Unable to tidy content type: " +
inConnection.getContentType());
}
} catch (IOException ioe) {
throw new XMLHelperException("Unable to perform input/output", ioe);
}
}
// A utility method for converting a String encoding of a URL to a URL
private static URL convertStringToURL(String url) throws XMLHelperException {
try {
return new URL(url);
} catch (MalformedURLException murle) {
throw new XMLHelperException(url + " is not a well formed URL", murle);
}
}
}