IBM Content Analytics with Enterprise Search, Version 3.0.0                  

Sample plug-in application for non-web crawlers

The sample crawler plug-in application shows how you can change security token values, metadata, and the content of crawled documents.

package sample;

import java.io.BufferedWriter;
import java.io.OutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.List;

import com.ibm.es.crawler.plugin.AbstractCrawlerPlugin;
import com.ibm.es.crawler.plugin.Content;
import com.ibm.es.crawler.plugin.CrawledData;
import com.ibm.es.crawler.plugin.CrawlerPluginException;
import com.ibm.es.crawler.plugin.FieldMetadata;

/**
 * The <code>MyCrawlerPlugin</code> is a sample crawler plugin module.
 *
 * For every crawled document this sample appends a sample security token,
 * adds a sample metadata field, sets the document language, and replaces
 * the document content with a fixed plain-text string.
 */
public class MyCrawlerPlugin extends AbstractCrawlerPlugin {

   /** Security token that this sample appends to every document. */
   private static final String SAMPLE_TOKEN = "SampleToken";

   /** Text that this sample writes as the new document content. */
   private static final String SAMPLE_TEXT = "The new content of plain text ";

   /** Code page set on the content and encoding used to write it. */
   private static final String CONTENT_ENCODING = "UTF-8";

   /**
    * Default constructor.
    */
   public MyCrawlerPlugin() {
      super();
   }

   /**
    * Initialize this object.
    * 
    * This sample program has nothing in this method.
    * 
    * @throws CrawlerPluginException declared by the overridden method;
    *         never thrown by this sample
    * @see com.ibm.es.crawler.plugin.AbstractCrawlerPlugin#init()
    */
   public void init() throws CrawlerPluginException {

      /*
       * [Tips]
       * If your crawler plugin module requires something to do for 
       * initialization, add the code here.
       * [Example]
       * Get JDBC connection for your local system.
       * connection = DriverManager.getConnection("jdbc:db2:xxxx");
       */

   }

   /**
    * Returns the Boolean value for metadata usage.
    * 
    * This sample program returns <code>true</code> because
    * <code>updateDocument</code> changes both metadata and security tokens.
    *  
    * @return <code>true</code>
    * @see com.ibm.es.crawler.plugin.AbstractCrawlerPlugin#isMetadataUsed()
    */
   public boolean isMetadataUsed() {

      /*
       * [Tips]
       * If your crawler plugin module updates both metadata and security 
       * tokens, return true.
       * If your crawler plugin module updates security tokens only, 
       * return false.
       */
      return true;
   }

   /**
    * Terminate this object.
    * 
    * This sample program has nothing in this method.
    * 
    * @throws CrawlerPluginException declared by the overridden method;
    *         never thrown by this sample
    * @see com.ibm.es.crawler.plugin.AbstractCrawlerPlugin#term()
    */
   public void term() throws CrawlerPluginException {

      /*
       * [Tips]
       * If your crawler plugin module requires something to do 
       * for termination, add the code here.
       * [Example]
       * Close JDBC connection for your local system.
       * connection.close(); 
       */

   }

   /**
    * Update crawled data.
    * 
    * This sample program appends a security token, adds a metadata field,
    * sets the language, and overwrites the content of the document.
    * 
    * @param crawledData the crawled document to update
    * @return the same <code>crawledData</code> object after it was updated;
    *         a plugin that wants to reject a document returns
    *         <code>null</code> instead (this sample never does)
    * @throws CrawlerPluginException if reading or writing the document
    *         content fails with an <code>IOException</code>
    * @see com.ibm.es.crawler.plugin.AbstractCrawlerPlugin#updateDocument(com.ibm.es.crawler.plugin.CrawledData)
    */
   public CrawledData updateDocument(CrawledData crawledData) 
   throws CrawlerPluginException {

      // Get uri string, security tokens, and field metadata.
      // The uri is not used by this sample; it is fetched to show what is
      // available to a check such as the rejection test described below.
      String url = crawledData.getURI();
      String securityTokens = crawledData.getSecurityTokens();
      List metadataList = crawledData.getMetadataList();
      if (metadataList == null) {
         metadataList = new ArrayList();
      }

      /*
       * [Tips]
       * If your crawler plugin module rejects some crawled data,
       * add the check code here and return null. 
       * This sample always returns the updated document.
       */

      /*
       * [Tips]
       * If your crawler plugin module updates the security tokens,
       * add the code here.
       */
      // update security token (for sample)
      // NOTE(review): the setter name "setSecurityTokes" is kept exactly as
      // in the original sample - confirm the spelling against the
      // CrawledData API before renaming it.
      crawledData.setSecurityTokes(appendToken(securityTokens, SAMPLE_TOKEN));

      /*
       * [Tips]
       * If your crawler plugin module updates metadata,
       * add the code here.
       */
      // update metadata (for sample)
      metadataList.add(new FieldMetadata("copyright", "IBM"));
      crawledData.setMetadataList(metadataList);

      /*
       * Set language. 
       */
      crawledData.setLanguage("en");
      crawledData.setLanguageAutoDetection(true);

      /*
       * Update Content. since 8.3
       */
      Content content = openContent(crawledData);

      // set information against the content.
      content.setCodepage(CONTENT_ENCODING);
      content.setCodepageAutoDetection(true);
      content.setMimeType("text/plain");

      // Overwrite the content.
      writeContent(content, SAMPLE_TEXT);

      // Submit change for the content.
      crawledData.submitContent(content);

      return crawledData;
   }

   /* (non-Javadoc)
    * Returns true; this sample reads and overwrites the document content
    * in updateDocument.
    * NOTE(review): presumably this must be true for the content to be
    * available to the plugin - confirm against the API documentation.
    * @see com.ibm.es.crawler.plugin.AbstractCrawlerPlugin#isContentUsed()
    */
   public boolean isContentUsed() {
      return true;
   }

   /**
    * Appends a token to a comma-separated list of security tokens.
    * 
    * @param existingTokens the current token list; may be <code>null</code>
    *        or empty
    * @param token the token to append
    * @return <code>token</code> alone when there are no existing tokens,
    *         otherwise <code>existingTokens + "," + token</code>
    */
   private static String appendToken(String existingTokens, String token) {
      // Without this guard a null list would become the text "null,<token>"
      // and an empty list would produce a leading comma.
      if (existingTokens == null || existingTokens.length() == 0) {
         return token;
      }
      return existingTokens + "," + token;
   }

   /**
    * Returns the content object that the plugin is going to overwrite.
    * 
    * If the document has original content, its input stream is opened and
    * closed again (the place where a real plugin would read it); otherwise
    * a new content object is created.
    * 
    * @param crawledData the crawled document
    * @return the original content, or a newly created content when the
    *         document has none
    * @throws CrawlerPluginException if an <code>IOException</code> occurs
    */
   private Content openContent(CrawledData crawledData) 
   throws CrawlerPluginException {

      Content content = crawledData.getOriginalContent();

      try {
         // if the original crawled content is null, create the new content
         // (exactly once).
         if (content == null) {
            return crawledData.createNewContent();
         }

         // if the original crawled content exists, get InputStream 
         // object to access it.
         java.io.InputStream in = content.getInputStream();
         try {
            /*
             * [Tips]
             * Read the original content from the stream here.
             */
         } finally {
            // close the stream even when reading fails
            in.close();
         }
      } catch (IOException ioe) {
         throw new CrawlerPluginException(ioe);
      }

      return content;
   }

   /**
    * Writes the given text to the output stream of the content.
    * 
    * @param content the content to overwrite
    * @param text the new text of the content
    * @throws CrawlerPluginException if an <code>IOException</code> occurs
    */
   private void writeContent(Content content, String text) 
   throws CrawlerPluginException {

      try {
         OutputStream outputStream = content.getOutputStream();

         // write content to OutputStream
         BufferedWriter writer = new BufferedWriter(
               new OutputStreamWriter(outputStream, CONTENT_ENCODING));
         try {
            writer.write(text);
            writer.flush();
         } finally {
            // closing the writer also closes the wrapped output stream,
            // even when the write above fails
            writer.close();
         }
      } catch (IOException ioe) {
         throw new CrawlerPluginException(ioe);
      }
   }

}

Feedback

Last updated: May 2012

© Copyright IBM Corporation 2004, 2012.
This information center is powered by Eclipse technology. (http://www.eclipse.org)