Code for Pushing XML Data to a Collection

// (C) Copyright IBM Corp. 2009, 2014 All Rights Reserved.
// All rights reserved

/**
 *
 * Notes:
 *
 * Create a new collection if necessary
 * Push each readable XML file named on the command line into the collection
 * (directories and unreadable files are skipped with a warning)
 *
 * ALL AUTHENTICATED OPERATIONS DONE WITH the user/password
 *  apitestuser/testpw, which is hardwired into this app. You
 *  should either create this user in the Watson Explorer Engine installation
 *  that you are testing with, or update the code for this
 *  sample application, substituting a valid user/password
 *  combination for that Watson Explorer Engine installation.
 *
 */

import velocity.*;
import velocity.objects.*;
import velocity.soap.*;
import velocity.types.*;

import java.util.Enumeration;
import java.io.File;
import java.io.FileInputStream;

import java.net.URL;
import javax.xml.namespace.QName;
import javax.xml.stream.*;

/**
 * Command-line sample that reads XML files and enqueues their
 * <code>content</code> elements into a search collection through the
 * Velocity SOAP API.
 *
 * <p>Usage: <code>XmlPush [-v] [-e endpoint] collection filename(s)</code>.
 * Options must precede the collection name; every argument after the
 * collection name is treated as a file name.</p>
 *
 * <p>The collection's status is requested first; when the SOAP fault names
 * <code>search-collection-invalid-name</code> the collection is created
 * before anything is enqueued.</p>
 */
public class XmlPush {

    /** Placeholder endpoint; while unchanged, usage() tells the user to pass -e. */
    private static final String DEFAULT_ENDPOINT =
        "http://HOSTNAME/vivisimo/cgi-bin/velocity?v.app=api-soap&";

    /** Maximum number of characters of snippet text copied into the displayable content. */
    private static final int SNIPPET_DISPLAY_LENGTH = 20;

    /** SOAP endpoint in use; overridden by the -e command-line option. */
    static String endpoint = DEFAULT_ENDPOINT;

    /** When true, progress messages are printed; set by the -v command-line option. */
    static java.lang.Boolean verbose = false;


    /**
     * Prints the command-line synopsis, plus a reminder to supply an endpoint
     * when the built-in placeholder endpoint is still in effect.
     */
    public static void usage() {
        System.out.println();
        System.out.println("usage:  XmlPush [-v] [-e endpoint] collection filename(s)\n");
        if (DEFAULT_ENDPOINT.equals(endpoint)) {
            System.out.println("SOAP endpoint has not been changed in the source code.\nYou will need to specify an endpoint on the commandline.");
        }
        System.out.println();
    }


    /**
     * Program entry point: parses the command line, makes sure the collection
     * exists, enqueues the content of every input file, waits four seconds and
     * prints the resulting crawler and indexer status.
     *
     * <p>Exits with status 1 on a command-line or endpoint URL error and with
     * status 2 when the initial status request fails for a reason other than a
     * SOAP fault.</p>
     *
     * @param args command-line arguments, see the class description
     */
    public static void main(String[] args) {

        java.util.List<File> filesToPush = new java.util.ArrayList<File>();
        String collection = parseCommandLine(args, filesToPush);

        System.out.println("\nUsing collection=["+collection+"]");
        System.out.println("Using endpoint=["+endpoint+"]");


        trace("Create service...");
        VelocityService vs = null;
        try {
            vs = new VelocityService( new URL(endpoint+"wsdl=1&specialize-for=&use-types=true&"),
                                      new QName("urn:/velocity", "VelocityService"));
        } catch (java.net.MalformedURLException e) {
            System.out.println("ERROR:\n"+e);
            System.exit(1);
        }
        traceln(" done.");

        trace("Get port from service...");
        VelocityPort vp = vs.getVelocityPort();
        traceln(" done.");

        trace("Set port's endpoint...");
        ((javax.xml.ws.BindingProvider) vp).getRequestContext().put(
            javax.xml.ws.BindingProvider.ENDPOINT_ADDRESS_PROPERTY, endpoint);
        traceln(" done.");

        trace("Initialize authentication object...");
        Authentication authentication = new Authentication();
        authentication.setUsername("apitestuser");
        authentication.setPassword("testpw");
        traceln(" done.");


        /* The crawler should start automatically when we enqueue if it is not running,
           but let's get the status first to make sure the collection exists
        */
        trace("Initialize search collection status...");
        SearchCollectionStatus scs = new SearchCollectionStatus();
        scs.setAuthentication(authentication);
        scs.setCollection(collection);
        traceln(" done.");

        ensureCollectionExists(vp, scs, authentication, collection);

        trace("Initialize search collection enqueue...");
        SearchCollectionEnqueue sce = new SearchCollectionEnqueue();
        SearchCollectionEnqueue.CrawlUrls urlsToEnqueue =
            new SearchCollectionEnqueue.CrawlUrls();
        sce.setAuthentication(authentication);
        sce.setCollection(collection);
        sce.setCrawlUrls(urlsToEnqueue);
        traceln(" done.");

        java.util.List<CrawlUrl> crawlUrlList = urlsToEnqueue.getCrawlUrl();
        XMLInputFactory xmlif = XMLInputFactory.newInstance();

        for (File f : filesToPush) {
            addFileToEnqueue(f, xmlif, crawlUrlList);
        }

        trace("Enqueue data...");
        vp.searchCollectionEnqueue(sce);
        traceln(" done.");

        /* give the crawler and indexer a moment before asking for status again */
        try {
            Thread.sleep(4000);
        } catch (java.lang.InterruptedException ie) {
            /* keep the interrupt visible to whatever runs after us instead of discarding it */
            Thread.currentThread().interrupt();
        }

        trace("Request status of ["+collection+"]...");
        /* reuse the request object already created for the initial status check */
        SearchCollectionStatusResponse scsr = vp.searchCollectionStatus(scs);
        traceln(" done.");

        printStatus(collection, scsr);
        System.out.println();


        traceln("\nexiting");

    }


    /** Prints the message without a line break, but only in verbose mode. */
    private static void trace(String message) {
        if (verbose) {
            System.out.print(message);
        }
    }


    /** Prints the message followed by a line break, but only in verbose mode. */
    private static void traceln(String message) {
        if (verbose) {
            System.out.println(message);
        }
    }


    /** Prints the usage text and terminates the JVM with exit status 1. */
    private static void exitWithUsage() {
        usage();
        System.exit(1);
    }


    /**
     * Parses the command line: leading -v / -e options, then the collection
     * name, then the files to push. Sets the static verbose and endpoint
     * fields as a side effect and exits through usage() when the collection
     * name is missing or no usable file remains.
     *
     * @param args        raw command-line arguments
     * @param filesToPush receives every argument that names a readable regular file
     * @return the collection name
     */
    private static String parseCommandLine(String[] args, java.util.List<File> filesToPush) {
        String collection = "";
        int i = 0;

        /* leading options; the first argument that is not an option is the collection */
        for ( ; i < args.length; i++) {
            if ("-v".equals(args[i])) {
                verbose = true;
            } else if ("-e".equals(args[i])) {
                if ( i+1 < args.length ) {
                    i++;
                    endpoint = args[i];
                } else {
                    exitWithUsage();
                }
            } else {
                break;
            }
        }

        if ( i < args.length ) {
            collection = args[i];
            i++;
            /* everything after the collection name is a candidate input file */
            for ( ; i < args.length; i++) {
                File f = new File(args[i]);
                if ( ! f.canRead() ) {
                    try {
                        System.out.println("WARNING:  can't read file ["+f.getCanonicalPath()+"].  Skipping.");
                    } catch (java.io.IOException ioe) {
                        System.out.println("WARNING:  exception caught processing filename ["+args[i]+"].  Skipping.");
                    }
                } else if ( f.isDirectory() ) {
                    try {
                        System.out.println("WARNING:  ["+f.getCanonicalPath()+"] is a directory, not a file.  Skipping.");
                    } catch (java.io.IOException ioe) {
                        System.out.println("WARNING:  exception caught processing directory ["+args[i]+"].  Skipping.");
                    }
                } else {
                    filesToPush.add(f);
                }
            }
        }

        if ("".equals(collection) || filesToPush.isEmpty()) {
            exitWithUsage();
        }

        if (verbose) {
            System.out.println("Using files:");
            for (File f : filesToPush) {
                try {
                    System.out.println("\t"+f.getCanonicalPath());
                } catch (java.io.IOException ioe) {
                    System.out.println("EXCEPTION: caught printing canonical path of input filename\n"+ioe);
                }
            }
        }

        return collection;
    }


    /**
     * Requests the collection's status and reports the crawler and indexer
     * state. When the request fails with a SOAP fault whose fault string
     * contains search-collection-invalid-name, the collection is created;
     * any other SOAP fault is reported as a warning and processing continues.
     * Any other exception terminates the JVM with exit status 2.
     *
     * @param vp             SOAP port used for the requests
     * @param scs            prepared status request for the collection
     * @param authentication credentials attached to the create request
     * @param collection     collection name, used in messages and the create request
     */
    private static void ensureCollectionExists(VelocityPort vp, SearchCollectionStatus scs,
                                               Authentication authentication, String collection) {
        try {
            trace("Request status of ["+collection+"]...");
            SearchCollectionStatusResponse scsr = vp.searchCollectionStatus(scs);
            traceln(" done.");

            VseStatus vses = null;
            if (scsr != null) {
                vses = scsr.getVseStatus();
            }
            if (vses == null) {
                System.out.println("Collection ["+collection+"] exists, but has no status.");
            } else {
                /* either part may be absent; a missing part must not stop the push */
                CrawlerStatus cs = vses.getCrawlerStatus();
                VseIndexStatus vseis = vses.getVseIndexStatus();
                if (cs == null) {
                    System.out.println("This collection has no crawler status.");
                } else {
                    System.out.println("This collection's crawler is ["+cs.getServiceStatus()+"].");
                }
                if (vseis == null) {
                    System.out.println("This collection has no indexer status.");
                } else {
                    System.out.println("This collection's indexer is ["+vseis.getServiceStatus()+"].");
                }
            }
        } catch (javax.xml.ws.soap.SOAPFaultException e) {
            if (e.getFault().getFaultString().contains("search-collection-invalid-name") ) {
                System.out.println("\n\n\tWARNING: collection ["+collection+"] does not exist.");
                System.out.print("\tWARNING: create ["+collection+"] based on [default-push]...");
                /* [default-push] is the default base collection when creating a new collection via the api */
                SearchCollectionCreate scc = new SearchCollectionCreate();
                scc.setAuthentication(authentication);
                scc.setCollection(collection);
                vp.searchCollectionCreate(scc);
                System.out.println(" done.");
            } else {
                System.out.println("\n\n\tWARNING: "+e);
                System.out.println("\nproceed anyway...");
            }
        } catch (java.lang.Exception e) {
            System.out.println("\nERROR: \n"+e);
            System.exit(2);
        }
    }


    /**
     * Builds one crawl URL for the file, appends it to the list to enqueue and
     * fills it with the content elements read from the file. Read and parse
     * failures are reported on standard output and do not stop the run. The
     * file stream and the XML reader are always closed.
     *
     * @param f            XML file to read
     * @param xmlif        factory used to create the stream reader
     * @param crawlUrlList list that receives the crawl URL built for this file
     */
    private static void addFileToEnqueue(File f, XMLInputFactory xmlif,
                                         java.util.List<CrawlUrl> crawlUrlList) {
        FileInputStream in = null;
        XMLStreamReader xmlsr = null;
        try {
            in = new FileInputStream(f);
            xmlsr = xmlif.createXMLStreamReader(in);

            if (verbose) {
                System.out.print("Build data to enqueue for ["+f.getCanonicalPath()+"]...\n");
            }

            /* all documents need a url, if not for crawling, then for later updates or deletes */
            String urlString = "myproto://doc?id="+f.hashCode();
            CrawlUrl crawlUrl = new CrawlUrl();
            crawlUrl.setUrl(urlString);
            crawlUrl.setStatus("complete");
            crawlUrl.setEnqueueType("reenqueued");
            crawlUrlList.add(crawlUrl);

            CrawlData crawlData0 = new CrawlData();
            CrawlData.Vxml vxml0 = new CrawlData.Vxml();
            crawlData0.setContentType("application/vxml");
            crawlData0.setVxml(vxml0);
            crawlUrl.getCrawlData().add(crawlData0);

            Document d0 = new Document();
            vxml0.getDocument().add(d0);

            /* read in the xml file and insert data into the document for this crawlUrl */
            readContents(xmlsr, d0.getContent());

        } catch (java.io.FileNotFoundException fnfe) {
            System.out.println("\nERROR The file disappeared?!? ["+f+"]: "+fnfe);
        } catch (javax.xml.stream.XMLStreamException xmlse) {
            System.out.println("\nERROR xml stream exception ["+f+"]: "+xmlse);
        } catch (java.io.IOException ioe) {
            System.out.println("\nERROR IO Exception ["+f+"]: "+ioe);
        } finally {
            if (xmlsr != null) {
                try {
                    xmlsr.close();
                } catch (javax.xml.stream.XMLStreamException ignored) {
                    /* nothing useful can be done about a failed close */
                }
            }
            /* XMLStreamReader.close() does not close the underlying stream, so close it here */
            if (in != null) {
                try {
                    in.close();
                } catch (java.io.IOException ignored) {
                    /* nothing useful can be done about a failed close */
                }
            }
        }
    }


    /**
     * Walks the XML event stream to its end and converts every
     * <code>content</code> start element into one or two Content objects.
     * <code>document</code> start elements are only reported in verbose mode.
     *
     * @param xmlsr       reader positioned at the start of the XML document
     * @param contentList list that receives the Content objects
     * @throws XMLStreamException when the XML cannot be read
     */
    private static void readContents(XMLStreamReader xmlsr, java.util.List<Content> contentList)
            throws XMLStreamException {
        while (xmlsr.hasNext()) {
            if ( xmlsr.isStartElement() ) {
                String localname = xmlsr.getLocalName();
                if ( "document".equals(localname) ) {
                    /* we are at the start of a document */
                    traceln("\t"+localname);
                } else if ( "content".equals(localname) ) {
                    /* we are at the start of a content */
                    addContent(xmlsr, contentList);
                }
            }
            xmlsr.next();
        }
    }


    /**
     * Converts the <code>content</code> element the reader is positioned on.
     * An element whose name attribute is "snippet" yields two Content objects:
     * one with action index-only holding the full text, and one with weight -1
     * holding at most the first SNIPPET_DISPLAY_LENGTH characters. Any other
     * element yields a single Content carrying its name, action, weight and text.
     *
     * @param xmlsr       reader positioned on a content start element
     * @param contentList list that receives the Content objects
     * @throws XMLStreamException when the element text cannot be read
     */
    private static void addContent(XMLStreamReader xmlsr, java.util.List<Content> contentList)
            throws XMLStreamException {
        /* attributes are only available on the start tag, so fetch them before getElementText() */
        String contentName = xmlsr.getAttributeValue(null, "name");
        String contentAction = xmlsr.getAttributeValue(null, "action");
        String contentWeight = xmlsr.getAttributeValue(null, "weight");

        Content c0 = new Content();
        contentList.add(c0);

        if ( "snippet".equals(contentName) ) {
            String text = xmlsr.getElementText();

            /* create the searchable, not displayable, content */
            c0.setName(contentName);
            traceln("\t\tcontent "+contentName);

            c0.setAction("index-only");
            traceln("\t\t\taction index-only");

            if ( contentWeight != null ) {
                c0.setWeight(Double.valueOf(contentWeight));
                traceln("\t\t\tweight "+contentWeight);
            }

            c0.setValue(text);
            traceln("\t\t\t\t"+text);

            /* create the displayable, not searchable, content */
            Content c1 = new Content();
            contentList.add(c1);

            c1.setName(contentName);
            traceln("\t\tcontent "+contentName);

            if ( contentAction != null ) {
                c1.setAction(contentAction);
                traceln("\t\t\taction "+contentAction);
            }

            c1.setWeight( (double) -1 );
            traceln("\t\t\tweight -1");

            String shortText = truncateForDisplay(text);
            c1.setValue(shortText);
            traceln("\t\t\t\t"+shortText);

        } else {
            /* This is not the main body of searchable text, so process normally */
            c0.setName(contentName);
            traceln("\t\tcontent "+contentName);

            if ( contentAction != null ) {
                c0.setAction(contentAction);
                traceln("\t\t\taction "+contentAction);
            }
            if ( contentWeight != null ) {
                c0.setWeight(Double.valueOf(contentWeight));
                traceln("\t\t\tweight "+contentWeight);
            }

            String text = xmlsr.getElementText();
            c0.setValue(text);
            traceln("\t\t\t\t"+text);
        }
    }


    /**
     * Returns at most the first SNIPPET_DISPLAY_LENGTH characters of the text.
     * Text that is already short enough is returned unchanged, so short
     * snippets no longer fail with a StringIndexOutOfBoundsException.
     *
     * @param text full snippet text
     * @return the text itself, or its leading SNIPPET_DISPLAY_LENGTH characters
     */
    private static String truncateForDisplay(String text) {
        if (text.length() <= SNIPPET_DISPLAY_LENGTH) {
            return text;
        }
        return text.substring(0, SNIPPET_DISPLAY_LENGTH);
    }


    /**
     * Prints the crawler and indexer counters from a status response. When the
     * response or its status element is missing, a single explanatory line is
     * printed instead; a missing crawler or indexer part is noted inside the
     * report.
     *
     * @param collection collection name used in the report header and footer
     * @param scsr       status response, may be null
     */
    private static void printStatus(String collection, SearchCollectionStatusResponse scsr) {
        VseStatus vses = null;
        if (scsr != null) {
            vses = scsr.getVseStatus();
        }
        if (vses == null) {
            System.out.println("Collection ["+collection+"] exists, but has no status.");
            return;
        }

        VseIndexStatus vseis = vses.getVseIndexStatus();
        CrawlerStatus cs = vses.getCrawlerStatus();

        /* see schema documentation of vse-status, crawler-status, and vse-index-status for more detail */
        System.out.println("|******** Begin status of ["+collection+"] ********|");
        System.out.println("|* Crawler:");
        if (cs == null) {
            System.out.println("|* \t(no crawler status available)");
        } else {
            System.out.println("|* \tversion\t\t\t\t"+cs.getVersion());
            System.out.println("|* \tcrawler is\t\t\t"+cs.getServiceStatus());
            System.out.println("|* \tunique URLs input\t\t"+cs.getNInput());
            System.out.println("|* \tunique URLs output\t\t"+cs.getNOutput());
            System.out.println("|* \tfetch or conversion errors\t"+cs.getNErrors());
            System.out.println("|* \tunique URLs pending\t\t"+cs.getNPending());
            System.out.println("|* \tbytes crawled\t\t\t"+cs.getNBytes());
            System.out.println("|* \tbytes downloaded\t\t"+cs.getNDlBytes());
            System.out.println("|* \tbytes of converted data\t\t"+cs.getConvertedSize());
        }

        System.out.println("|* Indexer:");
        if (vseis == null) {
            System.out.println("|* \t(no indexer status available)");
        } else {
            System.out.println("|* \tversion\t\t\t\t"+vseis.getServiceVersion());
            System.out.println("|* \tindexer is\t\t\t"+vseis.getServiceStatus());
            System.out.println("|* \tindexed URLs\t\t\t"+vseis.getIndexedUrls());
            System.out.println("|* \tindexed documents\t\t"+vseis.getIndexedDocs());
            System.out.println("|* \tvalid documents\t\t\t"+vseis.getNDocs());
            System.out.println("|* \tindexed contents\t\t"+vseis.getIndexedContents());
            System.out.println("|* \tindexed bytes\t\t\t"+vseis.getIndexedBytes());
            System.out.println("|* \truntime (seconds)\t\t"+vseis.getRunningTime());
            System.out.println("|* \terror items\t\t\t"+vseis.getErrorItems());
        }
        System.out.println("|********  End status of ["+collection+"]  ********|");
    }
}