// The sample crawler plug-in application shows how you can change security token values, metadata, and the content of crawled documents.
package sample;
import java.io.BufferedWriter;
import java.io.OutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.List;
import com.ibm.es.crawler.plugin.AbstractCrawlerPlugin;
import com.ibm.es.crawler.plugin.Content;
import com.ibm.es.crawler.plugin.CrawledData;
import com.ibm.es.crawler.plugin.CrawlerPluginException;
import com.ibm.es.crawler.plugin.FieldMetadata;
/**
 * The <code>MyCrawlerPlugin</code> is a sample crawler plugin module.
 *
 * For every crawled document it appends a sample security token, adds a
 * sample metadata field, sets the language, and replaces the document
 * content with a short plain-text string.
 */
public class MyCrawlerPlugin extends AbstractCrawlerPlugin {

    /** Security token appended to every crawled document by this sample. */
    private static final String SAMPLE_TOKEN = "SampleToken";

    /** Separator placed between security tokens in the token string. */
    private static final String TOKEN_SEPARATOR = ",";

    /** Codepage declared for, and used to encode, the replacement content. */
    private static final String CONTENT_CODEPAGE = "UTF-8";

    /** Text written as the replacement content. */
    private static final String SAMPLE_TEXT = "The new content of plain text ";

    /**
     * Default constructor.
     */
    public MyCrawlerPlugin() {
        super();
    }

    /**
     * Initialize this object.
     *
     * This sample program has nothing in this method.
     *
     * @throws CrawlerPluginException never thrown by this sample
     * @see com.ibm.es.crawler.plugin.AbstractCrawlerPlugin#init()
     */
    public void init() throws CrawlerPluginException {
        /*
         * [Tips]
         * If your crawler plugin module requires something to do for
         * initialization, add the code here.
         * [Example]
         * Get JDBC connection for your local system.
         * connection = DriverManager.getConnection("jdbc:db2:xxxx");
         */
    }

    /**
     * Returns the Boolean value for metadata usage.
     *
     * This sample program returns <code>true</code>.
     *
     * @return <code>true</code>, because this sample updates metadata as
     *         well as security tokens
     * @see com.ibm.es.crawler.plugin.AbstractCrawlerPlugin#isMetadataUsed()
     */
    public boolean isMetadataUsed() {
        /*
         * [Tips]
         * If your crawler plugin module updates both metadata and security
         * tokens, return true.
         * If your crawler plugin module updates security tokens only,
         * return false.
         */
        return true;
    }

    /**
     * Terminate this object.
     *
     * This sample program has nothing in this method.
     *
     * @throws CrawlerPluginException never thrown by this sample
     * @see com.ibm.es.crawler.plugin.AbstractCrawlerPlugin#term()
     */
    public void term() throws CrawlerPluginException {
        /*
         * [Tips]
         * If your crawler plugin module requires something to do
         * for termination, add the code here.
         * [Example]
         * Close JDBC connection for your local system.
         * connection.close();
         */
    }

    /**
     * Update crawled data.
     *
     * This sample program appends a sample security token, adds a
     * "copyright" metadata field, sets the language, and overwrites the
     * document content with a plain-text string.
     *
     * @param crawledData the crawled document to update
     * @return the updated <code>crawledData</code>; this sample never
     *         rejects a document, so it never returns <code>null</code>
     * @throws CrawlerPluginException if reading or writing the document
     *         content fails with an <code>IOException</code>
     * @see com.ibm.es.crawler.plugin.AbstractCrawlerPlugin#updateDocument(com.ibm.es.crawler.plugin.CrawledData)
     */
    public CrawledData updateDocument(CrawledData crawledData)
            throws CrawlerPluginException {
        /*
         * [Tips]
         * If your crawler plugin module rejects some crawled data,
         * add the check code here (for example, based on
         * crawledData.getURI()) and return null.
         * This sample always returns the updated document.
         */

        /*
         * [Tips]
         * If your crawler plugin module updates the security tokens,
         * add the code here.
         */
        // update security token (for sample)
        String updatedTokens =
                appendToken(crawledData.getSecurityTokens(), SAMPLE_TOKEN);
        crawledData.setSecurityTokens(updatedTokens);

        /*
         * [Tips]
         * If your crawler plugin module updates metadata,
         * add the code here.
         */
        // update metadata (for sample)
        List metadataList = crawledData.getMetadataList();
        if (metadataList == null) {
            // No metadata was supplied for this document; start a new list.
            metadataList = new ArrayList();
        }
        metadataList.add(new FieldMetadata("copyright", "IBM"));
        crawledData.setMetadataList(metadataList);

        /*
         * Set language.
         */
        // NOTE(review): a fixed language is set and auto detection is also
        // enabled; which one takes precedence is not visible here - confirm
        // against the plug-in API documentation.
        crawledData.setLanguage("en");
        crawledData.setLanguageAutoDetection(true);

        /*
         * Update Content. since 8.3
         */
        Content content = openContent(crawledData);

        // set information against the content.
        content.setCodepage(CONTENT_CODEPAGE);
        content.setCodepageAutoDetection(true);
        content.setMimeType("text/plain");

        // Overwrite the content.
        writeContent(content, SAMPLE_TEXT);

        // Submit change for the content.
        crawledData.submitContent(content);
        return crawledData;
    }

    /**
     * Returns the Boolean value for content usage.
     *
     * This sample program returns <code>true</code> because
     * {@link #updateDocument(CrawledData)} reads and replaces the content.
     *
     * @return <code>true</code>
     * @see com.ibm.es.crawler.plugin.AbstractCrawlerPlugin#isContentUsed()
     */
    public boolean isContentUsed() {
        return true;
    }

    /**
     * Appends a token to a comma-separated token string.
     *
     * @param existingTokens the current token string; may be
     *        <code>null</code> or empty
     * @param token the token to append
     * @return <code>token</code> alone when there are no existing tokens,
     *         otherwise the existing tokens, a comma, and the new token
     */
    private static String appendToken(String existingTokens, String token) {
        // Avoid producing "null,<token>" or a leading separator.
        if (existingTokens == null || existingTokens.length() == 0) {
            return token;
        }
        return existingTokens + TOKEN_SEPARATOR + token;
    }

    /**
     * Returns the content object to modify: the original crawled content
     * when it exists, otherwise newly created content.
     *
     * When original content exists, its input stream is opened so that a
     * plug-in can read it, and is always closed again before returning.
     *
     * @param crawledData the crawled document
     * @return the original or newly created content
     * @throws CrawlerPluginException if accessing the original content
     *         fails with an <code>IOException</code>
     */
    private Content openContent(CrawledData crawledData)
            throws CrawlerPluginException {
        Content content = crawledData.getOriginalContent();
        try {
            if (content == null) {
                // if the original crawled content is null, create the new
                // content (exactly once).
                content = crawledData.createNewContent();
            } else {
                // if the original crawled content exists, get InputStream
                // object to access it.
                java.io.InputStream in = content.getInputStream();
                if (in != null) {
                    try {
                        // read the content
                    } finally {
                        // Close even if reading fails, so the stream
                        // is not leaked.
                        in.close();
                    }
                }
            }
        } catch (IOException ioe) {
            throw new CrawlerPluginException(ioe);
        }
        return content;
    }

    /**
     * Writes text to the content's output stream using the declared
     * codepage, closing the stream whether or not the write succeeds.
     *
     * @param content the content to overwrite
     * @param text the replacement text
     * @throws CrawlerPluginException if writing fails with an
     *         <code>IOException</code>
     */
    private void writeContent(Content content, String text)
            throws CrawlerPluginException {
        try {
            OutputStream outputStream = content.getOutputStream();
            // write content to OutputStream
            BufferedWriter writer = new BufferedWriter(
                    new OutputStreamWriter(outputStream, CONTENT_CODEPAGE));
            try {
                writer.write(text);
                writer.flush();
            } finally {
                // Closing the writer also closes the underlying stream.
                writer.close();
            }
        } catch (IOException ioe) {
            throw new CrawlerPluginException(ioe);
        }
    }
}